diff options
Diffstat (limited to 'fs')
571 files changed, 32573 insertions, 21322 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index e98f56d3787d..9a1d42630751 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -96,14 +96,10 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) return acl; } -int v9fs_check_acl(struct inode *inode, int mask) +struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type) { - struct posix_acl *acl; struct v9fs_session_info *v9ses; - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - v9ses = v9fs_inode2v9ses(inode); if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) || ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) { @@ -111,18 +107,10 @@ int v9fs_check_acl(struct inode *inode, int mask) * On access = client and acl = on mode get the acl * values from the server */ - return 0; + return NULL; } - acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); + return v9fs_get_cached_acl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - return -EAGAIN; } static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl) @@ -165,40 +153,40 @@ err_free_out: int v9fs_acl_chmod(struct dentry *dentry) { int retval = 0; - struct posix_acl *acl, *clone; + struct posix_acl *acl; struct inode *inode = dentry->d_inode; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { - clone = posix_acl_clone(acl, GFP_KERNEL); + retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (retval) + return retval; + retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); - if (!clone) - return -ENOMEM; - retval = posix_acl_chmod_masq(clone, inode->i_mode); - if (!retval) - retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, clone); - posix_acl_release(clone); } return retval; } int v9fs_set_create_acl(struct dentry *dentry, - struct posix_acl *dpacl, struct posix_acl *pacl) + struct posix_acl **dpacl, struct posix_acl **pacl) { - v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); - v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl); - posix_acl_release(dpacl); - posix_acl_release(pacl); + if (dentry) { + v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, *dpacl); + v9fs_set_acl(dentry, ACL_TYPE_ACCESS, *pacl); + } + posix_acl_release(*dpacl); + posix_acl_release(*pacl); + *dpacl = *pacl = NULL; return 0; } -int v9fs_acl_mode(struct inode *dir, mode_t *modep, +int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl) { int retval = 0; - mode_t mode = *modep; + umode_t mode = *modep; struct posix_acl *acl = NULL; if (!S_ISLNK(mode)) { @@ -209,29 +197,18 @@ int v9fs_acl_mode(struct inode *dir, mode_t *modep, mode &= ~current_umask(); } if (acl) { - struct posix_acl *clone; - if (S_ISDIR(mode)) - *dpacl = acl; - clone = posix_acl_clone(acl, GFP_NOFS); - retval = -ENOMEM; - if (!clone) - goto cleanup; - - retval = posix_acl_create_masq(clone, &mode); - if (retval < 0) { - posix_acl_release(clone); - goto cleanup; - } + *dpacl = posix_acl_dup(acl); + retval = posix_acl_create(&acl, GFP_NOFS, &mode); + if (retval < 0) + return retval; if (retval > 0) - *pacl = clone; + *pacl = acl; + else + posix_acl_release(acl); } *modep = mode; return 0; -cleanup: - posix_acl_release(acl); - return retval; - } static int v9fs_remote_get_acl(struct dentry *dentry, const char *name, @@ -342,7 +319,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; retval = posix_acl_equiv_mode(acl, &mode); if (retval < 0) goto err_out; diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 59e18c2e8c7e..559556411965 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -16,14 +16,14 @@ #ifdef CONFIG_9P_FS_POSIX_ACL extern int v9fs_get_acl(struct inode *, struct p9_fid *); -extern int v9fs_check_acl(struct inode *inode, int mask); +extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type); extern int v9fs_acl_chmod(struct dentry *); extern int v9fs_set_create_acl(struct dentry *, - struct posix_acl *, struct posix_acl *); -extern int v9fs_acl_mode(struct inode *dir, mode_t *modep, + struct posix_acl **, struct posix_acl **); +extern int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl); #else -#define v9fs_check_acl NULL +#define v9fs_iop_get_acl NULL static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { return 0; @@ -33,12 +33,12 @@ static inline int v9fs_acl_chmod(struct dentry *dentry) return 0; } static inline int v9fs_set_create_acl(struct dentry *dentry, - struct posix_acl *dpacl, - struct posix_acl *pacl) + struct posix_acl **dpacl, + struct posix_acl **pacl) { return 0; } -static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep, +static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl) { diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index ef9661886112..2b78014a124a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -132,21 +132,19 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { - int token; + int token, r; if (!*p) continue; token = match_token(p, tokens, args); - if (token < Opt_uname) { - int r = match_int(&args[0], &option); + switch (token) { + case Opt_debug: + r = match_int(&args[0], &option); if (r < 0) { P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + "integer field, but no integer?\n"); ret = r; continue; } - } - switch (token) { - case Opt_debug: v9ses->debug = option; #ifdef CONFIG_NET_9P_DEBUG p9_debug_level = option; @@ -154,12 +152,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) break; case Opt_dfltuid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->dfltuid = option; break; case Opt_dfltgid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->dfltgid = option; break; case Opt_afid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->afid = option; break; case Opt_uname: diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 46ce357ca1ab..410ffd6ceb5f 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -54,9 +54,9 @@ extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_destroy_inode(struct inode *inode); -struct inode *v9fs_get_inode(struct super_block *sb, int mode); +struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t); int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode); + struct inode *inode, int mode, dev_t); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); @@ -83,4 +83,6 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode) v9inode->cache_validity |= V9FS_INO_INVALID_ATTR; return; } + +int v9fs_open_to_dotl_flags(int flags); #endif diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 9c2bdda5cd9d..598fff1a54e5 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -165,9 +165,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) } while (rdir->head < rdir->tail) { p9stat_init(&st); - err = p9stat_read(rdir->buf + rdir->head, - rdir->tail - rdir->head, &st, - fid->clnt->proto_version); + err = p9stat_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, &st); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; @@ -231,7 +230,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, while (err == 0) { if (rdir->tail == rdir->head) { err = p9_client_readdir(fid, rdir->buf, buflen, - filp->f_pos); + filp->f_pos); if (err <= 0) goto unlock_and_exit; @@ -241,10 +240,9 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, while (rdir->head < rdir->tail) { - err = p9dirent_read(rdir->buf + rdir->head, - rdir->tail - rdir->head, - &curdirent, - fid->clnt->proto_version); + err = p9dirent_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, + &curdirent); if (err < 0) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3c173fcc2c5a..62857a810a79 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -65,7 +65,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9inode = V9FS_I(inode); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) - omode = file->f_flags; + omode = v9fs_open_to_dotl_flags(file->f_flags); else omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses)); @@ -169,7 +169,18 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) /* convert posix lock to p9 tlock args */ memset(&flock, 0, sizeof(flock)); - flock.type = fl->fl_type; + /* map the lock type */ + switch (fl->fl_type) { + case F_RDLCK: + flock.type = P9_LOCK_TYPE_RDLCK; + break; + case F_WRLCK: + flock.type = P9_LOCK_TYPE_WRLCK; + break; + case F_UNLCK: + flock.type = P9_LOCK_TYPE_UNLCK; + break; + } flock.start = fl->fl_start; if (fl->fl_end == OFFSET_MAX) flock.length = 0; @@ -245,7 +256,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) /* convert posix lock to p9 tgetlock args */ memset(&glock, 0, sizeof(glock)); - glock.type = fl->fl_type; + glock.type = P9_LOCK_TYPE_UNLCK; glock.start = fl->fl_start; if (fl->fl_end == OFFSET_MAX) glock.length = 0; @@ -257,17 +268,26 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) res = p9_client_getlock_dotl(fid, &glock); if (res < 0) return res; - if (glock.type != F_UNLCK) { - fl->fl_type = glock.type; + /* map 9p lock type to os lock type */ + switch (glock.type) { + case P9_LOCK_TYPE_RDLCK: + fl->fl_type = F_RDLCK; + break; + case P9_LOCK_TYPE_WRLCK: + fl->fl_type = F_WRLCK; + break; + case P9_LOCK_TYPE_UNLCK: + fl->fl_type = F_UNLCK; + break; + } + if (glock.type != P9_LOCK_TYPE_UNLCK) { fl->fl_start = glock.start; if (glock.length == 0) fl->fl_end = OFFSET_MAX; else fl->fl_end = glock.start + glock.length - 1; fl->fl_pid = glock.proc_id; - } else - fl->fl_type = F_UNLCK; - + } return res; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 8bb5507e822f..879ed8851737 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -95,15 +95,18 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) /** * p9mode2unixmode- convert plan9 mode bits to unix mode bits * @v9ses: v9fs session information - * @mode: mode to convert + * @stat: p9_wstat from which mode need to be derived + * @rdev: major number, minor number in case of device files. * */ - -static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) +static int p9mode2unixmode(struct v9fs_session_info *v9ses, + struct p9_wstat *stat, dev_t *rdev) { int res; + int mode = stat->mode; - res = mode & 0777; + res = mode & S_IALLUGO; + *rdev = 0; if ((mode & P9_DMDIR) == P9_DMDIR) res |= S_IFDIR; @@ -116,9 +119,26 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) && (v9ses->nodev == 0)) res |= S_IFIFO; else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses)) - && (v9ses->nodev == 0)) - res |= S_IFBLK; - else + && (v9ses->nodev == 0)) { + char type = 0, ext[32]; + int major = -1, minor = -1; + + strncpy(ext, stat->extension, sizeof(ext)); + sscanf(ext, "%c %u %u", &type, &major, &minor); + switch (type) { + case 'c': + res |= S_IFCHR; + break; + case 'b': + res |= S_IFBLK; + break; + default: + P9_DPRINTK(P9_DEBUG_ERROR, + "Unknown special type %c %s\n", type, + stat->extension); + }; + *rdev = MKDEV(major, minor); + } else res |= S_IFREG; if (v9fs_proto_dotu(v9ses)) { @@ -131,7 +151,6 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) if ((mode & P9_DMSETVTX) == P9_DMSETVTX) res |= S_ISVTX; } - return res; } @@ -242,13 +261,13 @@ void v9fs_destroy_inode(struct inode *inode) } int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode) + struct inode *inode, int mode, dev_t rdev) { int err = 0; inode_init_owner(inode, NULL, mode); inode->i_blocks = 0; - inode->i_rdev = 0; + inode->i_rdev = rdev; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->a_ops = &v9fs_addr_operations; @@ -259,10 +278,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, case S_IFSOCK: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - inode->i_fop = &v9fs_file_operations_dotl; } else if (v9fs_proto_dotu(v9ses)) { inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; } else { P9_DPRINTK(P9_DEBUG_ERROR, "special files without extended mode\n"); @@ -335,7 +352,7 @@ error: * */ -struct inode *v9fs_get_inode(struct super_block *sb, int mode) +struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev) { int err; struct inode *inode; @@ -348,7 +365,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); return ERR_PTR(-ENOMEM); } - err = v9fs_init_inode(v9ses, inode, mode); + err = v9fs_init_inode(v9ses, inode, mode, rdev); if (err) { iput(inode); return ERR_PTR(err); @@ -435,11 +452,12 @@ void v9fs_evict_inode(struct inode *inode) static int v9fs_test_inode(struct inode *inode, void *data) { int umode; + dev_t rdev; struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_wstat *st = (struct p9_wstat *)data; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - umode = p9mode2unixmode(v9ses, st->mode); + umode = p9mode2unixmode(v9ses, st, &rdev); /* don't match inode of different type */ if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) return 0; @@ -473,6 +491,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, struct p9_wstat *st, int new) { + dev_t rdev; int retval, umode; unsigned long i_ino; struct inode *inode; @@ -496,8 +515,8 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, * later. */ inode->i_ino = i_ino; - umode = p9mode2unixmode(v9ses, st->mode); - retval = v9fs_init_inode(v9ses, inode, umode); + umode = p9mode2unixmode(v9ses, st, &rdev); + retval = v9fs_init_inode(v9ses, inode, umode, rdev); if (retval) goto error; @@ -532,6 +551,19 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, } /** + * v9fs_at_to_dotl_flags- convert Linux specific AT flags to + * plan 9 AT flag. + * @flags: flags to convert + */ +static int v9fs_at_to_dotl_flags(int flags) +{ + int rflags = 0; + if (flags & AT_REMOVEDIR) + rflags |= P9_DOTL_AT_REMOVEDIR; + return rflags; +} + +/** * v9fs_remove - helper function to remove files and directories * @dir: directory inode that is being deleted * @dentry: dentry that is being deleted @@ -558,7 +590,8 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) return retval; } if (v9fs_proto_dotl(v9ses)) - retval = p9_client_unlinkat(dfid, dentry->d_name.name, flags); + retval = p9_client_unlinkat(dfid, dentry->d_name.name, + v9fs_at_to_dotl_flags(flags)); if (retval == -EOPNOTSUPP) { /* Try the one based on path */ v9fid = v9fs_fid_clone(dentry); @@ -645,13 +678,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; - + d_instantiate(dentry, inode); return ofid; - error: if (ofid) p9_client_clunk(ofid); @@ -792,6 +823,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nameidata) { + struct dentry *res; struct super_block *sb; struct v9fs_session_info *v9ses; struct p9_fid *dfid, *fid; @@ -823,22 +855,35 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(result); } - - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + /* + * Make sure we don't use a wrong inode due to parallel + * unlink. For cached mode create calls request for new + * inode. But with cache disabled, lookup should do this. + */ + if (v9ses->cache) + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + else + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { result = PTR_ERR(inode); inode = NULL; goto error; } - result = v9fs_fid_add(dentry, fid); if (result < 0) goto error_iput; - inst_out: - d_add(dentry, inode); - return NULL; - + /* + * If we had a rename on the server and a parallel lookup + * for the new name, then make sure we instantiate with + * the new name. ie look up for a/b, while on server somebody + * moved b under k and client parallely did a lookup for + * k/b. + */ + res = d_materialise_unique(dentry, inode); + if (!IS_ERR(res)) + return res; + result = PTR_ERR(res); error_iput: iput(inode); error: @@ -1002,7 +1047,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, return PTR_ERR(st); v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(dentry->d_inode, stat); p9stat_free(st); kfree(st); @@ -1086,13 +1131,14 @@ void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb) { + mode_t mode; char ext[32]; char tag_name[14]; unsigned int i_nlink; struct v9fs_session_info *v9ses = sb->s_fs_info; struct v9fs_inode *v9inode = V9FS_I(inode); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_atime.tv_sec = stat->atime; inode->i_mtime.tv_sec = stat->mtime; @@ -1118,34 +1164,12 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, /* HARDLINKCOUNT %u */ sscanf(ext, "%13s %u", tag_name, &i_nlink); if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) - inode->i_nlink = i_nlink; + set_nlink(inode, i_nlink); } } - inode->i_mode = p9mode2unixmode(v9ses, stat->mode); - if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { - char type = 0; - int major = -1; - int minor = -1; - - strncpy(ext, stat->extension, sizeof(ext)); - sscanf(ext, "%c %u %u", &type, &major, &minor); - switch (type) { - case 'c': - inode->i_mode &= ~S_IFBLK; - inode->i_mode |= S_IFCHR; - break; - case 'b': - break; - default: - P9_DPRINTK(P9_DEBUG_ERROR, - "Unknown special type %c %s\n", type, - stat->extension); - }; - inode->i_rdev = MKDEV(major, minor); - init_special_inode(inode, inode->i_mode, inode->i_rdev); - } else - inode->i_rdev = 0; - + mode = stat->mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ @@ -1411,6 +1435,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) { + int umode; + dev_t rdev; loff_t i_size; struct p9_wstat *st; struct v9fs_session_info *v9ses; @@ -1419,6 +1445,12 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) st = p9_client_stat(fid); if (IS_ERR(st)) return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + umode = p9mode2unixmode(v9ses, st, &rdev); + if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + goto out; spin_lock(&inode->i_lock); /* @@ -1430,6 +1462,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) if (v9ses->cache) inode->i_size = i_size; spin_unlock(&inode->i_lock); +out: p9stat_free(st); kfree(st); return 0; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 276f4a69ecd4..0b5745e21946 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -153,7 +153,8 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, * later. */ inode->i_ino = i_ino; - retval = v9fs_init_inode(v9ses, inode, st->st_mode); + retval = v9fs_init_inode(v9ses, inode, + st->st_mode, new_decode_dev(st->st_rdev)); if (retval) goto error; @@ -190,6 +191,58 @@ v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, return inode; } +struct dotl_openflag_map { + int open_flag; + int dotl_flag; +}; + +static int v9fs_mapped_dotl_flags(int flags) +{ + int i; + int rflags = 0; + struct dotl_openflag_map dotl_oflag_map[] = { + { O_CREAT, P9_DOTL_CREATE }, + { O_EXCL, P9_DOTL_EXCL }, + { O_NOCTTY, P9_DOTL_NOCTTY }, + { O_TRUNC, P9_DOTL_TRUNC }, + { O_APPEND, P9_DOTL_APPEND }, + { O_NONBLOCK, P9_DOTL_NONBLOCK }, + { O_DSYNC, P9_DOTL_DSYNC }, + { FASYNC, P9_DOTL_FASYNC }, + { O_DIRECT, P9_DOTL_DIRECT }, + { O_LARGEFILE, P9_DOTL_LARGEFILE }, + { O_DIRECTORY, P9_DOTL_DIRECTORY }, + { O_NOFOLLOW, P9_DOTL_NOFOLLOW }, + { O_NOATIME, P9_DOTL_NOATIME }, + { O_CLOEXEC, P9_DOTL_CLOEXEC }, + { O_SYNC, P9_DOTL_SYNC}, + }; + for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) { + if (flags & dotl_oflag_map[i].open_flag) + rflags |= dotl_oflag_map[i].dotl_flag; + } + return rflags; +} + +/** + * v9fs_open_to_dotl_flags- convert Linux specific open flags to + * plan 9 open flag. + * @flags: flags to convert + */ +int v9fs_open_to_dotl_flags(int flags) +{ + int rflags = 0; + + /* + * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY + * and P9_DOTL_NOACCESS + */ + rflags |= flags & O_ACCMODE; + rflags |= v9fs_mapped_dotl_flags(flags); + + return rflags; +} + /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. * @dir: directory inode that is being created @@ -206,7 +259,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, int err = 0; gid_t gid; int flags; - mode_t mode; + umode_t mode; char *name = NULL; struct file *filp; struct p9_qid qid; @@ -258,7 +311,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, "Failed to get acl values in creat %d\n", err); goto error; } - err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid); + err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), + mode, gid, &qid); if (err < 0) { P9_DPRINTK(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", @@ -281,13 +335,13 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); + v9fs_set_create_acl(dentry, &dacl, &pacl); v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); @@ -328,6 +382,7 @@ error: err_clunk_old_fid: if (ofid) p9_client_clunk(ofid); + v9fs_set_create_acl(NULL, &dacl, &pacl); return err; } @@ -347,7 +402,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct p9_fid *fid = NULL, *dfid = NULL; gid_t gid; char *name; - mode_t mode; + umode_t mode; struct inode *inode; struct p9_qid qid; struct dentry *dir_dentry; @@ -402,10 +457,10 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); fid = NULL; } else { /* @@ -413,7 +468,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, * inode with stat. We need to get an inode * so that we can set the acl with dentry */ - inode = v9fs_get_inode(dir->i_sb, mode); + inode = v9fs_get_inode(dir->i_sb, mode, 0); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; @@ -421,12 +476,13 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, d_instantiate(dentry, inode); } /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); + v9fs_set_create_acl(dentry, &dacl, &pacl); inc_nlink(dir); v9fs_invalidate_inode_attr(dir); error: if (fid) p9_client_clunk(fid); + v9fs_set_create_acl(NULL, &dacl, &pacl); return err; } @@ -538,6 +594,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) { + mode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { @@ -549,12 +606,11 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) inode->i_ctime.tv_nsec = stat->st_ctime_nsec; inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; - inode->i_nlink = stat->st_nlink; - inode->i_mode = stat->st_mode; - inode->i_rdev = new_decode_dev(stat->st_rdev); + set_nlink(inode, stat->st_nlink); - if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) - init_special_inode(inode, inode->i_mode, inode->i_rdev); + mode = stat->st_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; @@ -576,7 +632,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) if (stat->st_result_mask & P9_STATS_GID) inode->i_gid = stat->st_gid; if (stat->st_result_mask & P9_STATS_NLINK) - inode->i_nlink = stat->st_nlink; + set_nlink(inode, stat->st_nlink); if (stat->st_result_mask & P9_STATS_MODE) { inode->i_mode = stat->st_mode; if ((S_ISBLK(inode->i_mode)) || @@ -655,14 +711,14 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); fid = NULL; } else { /* Not in cached mode. No need to populate inode with stat */ - inode = v9fs_get_inode(dir->i_sb, S_IFLNK); + inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; @@ -749,7 +805,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, int err; gid_t gid; char *name; - mode_t mode; + umode_t mode; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; @@ -808,17 +864,17 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); fid = NULL; } else { /* * Not in cached mode. No need to populate inode with stat. * socket syscall returns a fd, so we need instantiate */ - inode = v9fs_get_inode(dir->i_sb, mode); + inode = v9fs_get_inode(dir->i_sb, mode, rdev); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; @@ -826,10 +882,11 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, d_instantiate(dentry, inode); } /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); + v9fs_set_create_acl(dentry, &dacl, &pacl); error: if (fid) p9_client_clunk(fid); + v9fs_set_create_acl(NULL, &dacl, &pacl); return err; } @@ -883,6 +940,11 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) st = p9_client_getattr_dotl(fid, P9_STATS_ALL); if (IS_ERR(st)) return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + goto out; spin_lock(&inode->i_lock); /* @@ -894,6 +956,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) if (v9ses->cache) inode->i_size = i_size; spin_unlock(&inode->i_lock); +out: kfree(st); return 0; } @@ -914,7 +977,7 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = v9fs_listxattr, - .check_acl = v9fs_check_acl, + .get_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { @@ -924,7 +987,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = v9fs_listxattr, - .check_acl = v9fs_check_acl, + .get_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index feef6cdc1fd2..c70251d47ed1 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -149,7 +149,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, else sb->s_d_op = &v9fs_dentry_operations; - inode = v9fs_get_inode(sb, S_IFDIR | mode); + inode = v9fs_get_inode(sb, S_IFDIR | mode, 0); if (IS_ERR(inode)) { retval = PTR_ERR(inode); goto release_sb; diff --git a/fs/Kconfig b/fs/Kconfig index 19891aab9c6e..5f4c45d4aa10 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -109,7 +109,7 @@ source "fs/proc/Kconfig" source "fs/sysfs/Kconfig" config TMPFS - bool "Virtual memory file system support (former shm fs)" + bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM help Tmpfs is a file system which keeps all files in virtual memory. @@ -127,14 +127,21 @@ config TMPFS_POSIX_ACL select TMPFS_XATTR select GENERIC_ACL help - POSIX Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. + POSIX Access Control Lists (ACLs) support additional access rights + for users and groups beyond the standard owner/group/world scheme, + and this option selects support for ACLs specifically for tmpfs + filesystems. + + If you've selected TMPFS, it's possible that you'll also need + this option as there are a number of Linux distros that require + POSIX ACL support under /dev for certain features to work properly. + For example, some distros need this feature for ALSA-related /dev + files for sound to work properly. In short, if you're not sure, + say Y. To learn more about Access Control Lists, visit the POSIX ACLs for Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N. - config TMPFS_XATTR bool "Tmpfs extended attributes" depends on TMPFS diff --git a/fs/Makefile b/fs/Makefile index fb68c2b8cf8a..d2c3353d5477 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -29,7 +29,6 @@ obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o -obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o @@ -121,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ -obj-$(CONFIG_EXOFS_FS) += exofs/ +obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index d5250c5aae21..1dab6a174d6a 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -247,7 +247,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj) inode->i_gid = ADFS_SB(sb)->s_gid; inode->i_ino = obj->file_id; inode->i_size = obj->size; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 3a4557e8325c..de37ec842340 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -215,7 +215,7 @@ affs_remove_link(struct dentry *dentry) break; default: if (!AFFS_TAIL(sb, bh)->link_chain) - inode->i_nlink = 1; + set_nlink(inode, 1); } affs_free_block(sb, link_ino); goto done; @@ -316,7 +316,7 @@ affs_remove_header(struct dentry *dentry) if (inode->i_nlink > 1) retval = affs_remove_link(dentry); else - inode->i_nlink = 0; + clear_nlink(inode); affs_unlock_link(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 5d828903ac69..88a4b0b50058 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -54,7 +54,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) prot = be32_to_cpu(tail->protect); inode->i_size = 0; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mode = 0; AFFS_I(inode)->i_extcnt = 1; AFFS_I(inode)->i_ext_last = ~1; @@ -137,7 +137,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) sbi->s_hashsize + 1; } if (tail->link_chain) - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; inode->i_op = &affs_file_inode_operations; inode->i_fop = &affs_file_operations; @@ -304,7 +304,7 @@ affs_new_inode(struct inode *dir) inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_ino = block; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; @@ -387,7 +387,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block); affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); mark_buffer_dirty_inode(inode_bh, inode); - inode->i_nlink = 2; + set_nlink(inode, 2); ihold(inode); } affs_fix_checksum(sb, bh); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index e3e9efc1fdd8..780a11dc6318 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -277,7 +277,7 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; error = affs_add_entry(dir, inode, dentry, ST_FILE); if (error) { - inode->i_nlink = 0; + clear_nlink(inode); iput(inode); return error; } @@ -305,7 +305,7 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode) error = affs_add_entry(dir, inode, dentry, ST_USERDIR); if (error) { - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); return error; @@ -392,7 +392,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) return 0; err: - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); return error; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 346e3289abd7..2f213d109c21 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -90,7 +90,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, vnode->vfs_inode.i_uid = status->owner; vnode->vfs_inode.i_gid = status->group; vnode->vfs_inode.i_generation = vnode->fid.unique; - vnode->vfs_inode.i_nlink = status->nlink; + set_nlink(&vnode->vfs_inode, status->nlink); mode = vnode->vfs_inode.i_mode; mode &= ~S_IALLUGO; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0fdab6e03d87..d890ae3b2ce6 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -67,7 +67,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) fscache_attr_changed(vnode->cache); #endif - inode->i_nlink = vnode->status.nlink; + set_nlink(inode, vnode->status.nlink); inode->i_uid = vnode->status.owner; inode->i_gid = 0; inode->i_size = vnode->status.size; @@ -174,7 +174,7 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_op = &afs_autocell_inode_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_uid = 0; inode->i_gid = 0; inode->i_ctime.tv_sec = get_seconds(); @@ -440,8 +440,6 @@ void exit_aio(struct mm_struct *mm) static struct kiocb *__aio_get_req(struct kioctx *ctx) { struct kiocb *req = NULL; - struct aio_ring *ring; - int okay = 0; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) @@ -459,39 +457,114 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx) INIT_LIST_HEAD(&req->ki_run_list); req->ki_eventfd = NULL; - /* Check if the completion queue has enough free space to - * accept an event from this io. - */ + return req; +} + +/* + * struct kiocb's are allocated in batches to reduce the number of + * times the ctx lock is acquired and released. + */ +#define KIOCB_BATCH_SIZE 32L +struct kiocb_batch { + struct list_head head; + long count; /* number of requests left to allocate */ +}; + +static void kiocb_batch_init(struct kiocb_batch *batch, long total) +{ + INIT_LIST_HEAD(&batch->head); + batch->count = total; +} + +static void kiocb_batch_free(struct kiocb_batch *batch) +{ + struct kiocb *req, *n; + + list_for_each_entry_safe(req, n, &batch->head, ki_batch) { + list_del(&req->ki_batch); + kmem_cache_free(kiocb_cachep, req); + } +} + +/* + * Allocate a batch of kiocbs. This avoids taking and dropping the + * context lock a lot during setup. + */ +static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) +{ + unsigned short allocated, to_alloc; + long avail; + bool called_fput = false; + struct kiocb *req, *n; + struct aio_ring *ring; + + to_alloc = min(batch->count, KIOCB_BATCH_SIZE); + for (allocated = 0; allocated < to_alloc; allocated++) { + req = __aio_get_req(ctx); + if (!req) + /* allocation failed, go with what we've got */ + break; + list_add(&req->ki_batch, &batch->head); + } + + if (allocated == 0) + goto out; + +retry: spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); - if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { + ring = kmap_atomic(ctx->ring_info.ring_pages[0]); + + avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; + BUG_ON(avail < 0); + if (avail == 0 && !called_fput) { + /* + * Handle a potential starvation case. It is possible that + * we hold the last reference on a struct file, causing us + * to delay the final fput to non-irq context. In this case, + * ctx->reqs_active is artificially high. Calling the fput + * routine here may free up a slot in the event completion + * ring, allowing this allocation to succeed. + */ + kunmap_atomic(ring); + spin_unlock_irq(&ctx->ctx_lock); + aio_fput_routine(NULL); + called_fput = true; + goto retry; + } + + if (avail < allocated) { + /* Trim back the number of requests. */ + list_for_each_entry_safe(req, n, &batch->head, ki_batch) { + list_del(&req->ki_batch); + kmem_cache_free(kiocb_cachep, req); + if (--allocated <= avail) + break; + } + } + + batch->count -= allocated; + list_for_each_entry(req, &batch->head, ki_batch) { list_add(&req->ki_list, &ctx->active_reqs); ctx->reqs_active++; - okay = 1; } - kunmap_atomic(ring, KM_USER0); - spin_unlock_irq(&ctx->ctx_lock); - if (!okay) { - kmem_cache_free(kiocb_cachep, req); - req = NULL; - } + kunmap_atomic(ring); + spin_unlock_irq(&ctx->ctx_lock); - return req; +out: + return allocated; } -static inline struct kiocb *aio_get_req(struct kioctx *ctx) +static inline struct kiocb *aio_get_req(struct kioctx *ctx, + struct kiocb_batch *batch) { struct kiocb *req; - /* Handle a potential starvation case -- should be exceedingly rare as - * requests will be stuck on fput_head only if the aio_fput_routine is - * delayed and the requests were the last user of the struct file. - */ - req = __aio_get_req(ctx); - if (unlikely(NULL == req)) { - aio_fput_routine(NULL); - req = __aio_get_req(ctx); - } + + if (list_empty(&batch->head)) + if (kiocb_batch_refill(ctx, batch) == 0) + return NULL; + req = list_first_entry(&batch->head, struct kiocb, ki_batch); + list_del(&req->ki_batch); return req; } @@ -1387,13 +1460,13 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) ret = compat_rw_copy_check_uvector(type, (struct compat_iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + &kiocb->ki_iovec, 1); else #endif ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + &kiocb->ki_iovec, 1); if (ret < 0) goto out; @@ -1515,7 +1588,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, bool compat) + struct iocb *iocb, struct kiocb_batch *batch, + bool compat) { struct kiocb *req; struct file *file; @@ -1541,7 +1615,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!file)) return -EBADF; - req = aio_get_req(ctx); /* returns with 2 references to req */ + req = aio_get_req(ctx, batch); /* returns with 2 references to req */ if (unlikely(!req)) { fput(file); return -EAGAIN; @@ -1621,8 +1695,9 @@ long do_io_submit(aio_context_t ctx_id, long nr, { struct kioctx *ctx; long ret = 0; - int i; + int i = 0; struct blk_plug plug; + struct kiocb_batch batch; if (unlikely(nr < 0)) return -EINVAL; @@ -1639,6 +1714,8 @@ long do_io_submit(aio_context_t ctx_id, long nr, return -EINVAL; } + kiocb_batch_init(&batch, nr); + blk_start_plug(&plug); /* @@ -1659,12 +1736,13 @@ long do_io_submit(aio_context_t ctx_id, long nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, compat); + ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); if (ret) break; } blk_finish_plug(&plug); + kiocb_batch_free(&batch); put_ioctx(ctx); return i ? i : ret; } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index c5567cb78432..f11e43ed907d 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -187,7 +187,7 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd); */ static struct inode *anon_inode_mkinode(void) { - struct inode *inode = new_inode(anon_inode_mnt->mnt_sb); + struct inode *inode = new_inode_pseudo(anon_inode_mnt->mnt_sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -233,7 +233,7 @@ static int __init anon_inode_init(void) return 0; err_mntput: - mntput(anon_inode_mnt); + kern_unmount(anon_inode_mnt); err_unregister_filesystem: unregister_filesystem(&anon_inode_fs_type); err_exit: diff --git a/fs/attr.c b/fs/attr.c index 538e27959d3f..7ee7ba488313 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -13,6 +13,7 @@ #include <linux/fsnotify.h> #include <linux/fcntl.h> #include <linux/security.h> +#include <linux/evm.h> /** * inode_change_ok - check if attribute changes to an inode are allowed @@ -237,8 +238,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr) else error = simple_setattr(dentry, attr); - if (!error) + if (!error) { fsnotify_change(dentry, ia_valid); + evm_inode_post_setattr(dentry, ia_valid); + } return error; } diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 475f9c597cb7..326dc08d3e3f 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -39,27 +39,17 @@ /* #define DEBUG */ -#ifdef DEBUG -#define DPRINTK(fmt, args...) \ -do { \ - printk(KERN_DEBUG "pid %d: %s: " fmt "\n", \ - current->pid, __func__, ##args); \ -} while (0) -#else -#define DPRINTK(fmt, args...) do {} while (0) -#endif - -#define AUTOFS_WARN(fmt, args...) \ -do { \ +#define DPRINTK(fmt, ...) \ + pr_debug("pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__) + +#define AUTOFS_WARN(fmt, ...) \ printk(KERN_WARNING "pid %d: %s: " fmt "\n", \ - current->pid, __func__, ##args); \ -} while (0) + current->pid, __func__, ##__VA_ARGS__) -#define AUTOFS_ERROR(fmt, args...) \ -do { \ +#define AUTOFS_ERROR(fmt, ...) \ printk(KERN_ERR "pid %d: %s: " fmt "\n", \ - current->pid, __func__, ##args); \ -} while (0) + current->pid, __func__, ##__VA_ARGS__) /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 180fa2425e49..8179f1ab8175 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -342,7 +342,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode) inode->i_ino = get_next_ino(); if (S_ISDIR(mode)) { - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_op = &autofs4_dir_inode_operations; inode->i_fop = &autofs4_dir_operations; } else if (S_ISLNK(mode)) { diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 25435987d6ae..e1fbdeef85db 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -104,7 +104,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, size_t pktsz; DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", - wq->wait_queue_token, wq->name.len, wq->name.name, type); + (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type); memset(&pkt,0,sizeof pkt); /* For security reasons */ diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 54b8c28bebc8..8342ca67abcd 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -357,7 +357,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_gid = befs_sb->mount_opts.use_gid ? befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid); - inode->i_nlink = 1; + set_nlink(inode, 1); /* * BEFS's time is 64 bits, but current VFS is 32 bits... @@ -474,17 +474,22 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd) befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; - befs_debug(sb, "Follow long symlink"); - - link = kmalloc(len, GFP_NOFS); - if (!link) { - link = ERR_PTR(-ENOMEM); - } else if (befs_read_lsymlink(sb, data, link, len) != len) { - kfree(link); - befs_error(sb, "Failed to read entire long symlink"); + if (len == 0) { + befs_error(sb, "Long symlink with illegal length"); link = ERR_PTR(-EIO); } else { - link[len - 1] = '\0'; + befs_debug(sb, "Follow long symlink"); + + link = kmalloc(len, GFP_NOFS); + if (!link) { + link = ERR_PTR(-ENOMEM); + } else if (befs_read_lsymlink(sb, data, link, len) != len) { + kfree(link); + befs_error(sb, "Failed to read entire long symlink"); + link = ERR_PTR(-EIO); + } else { + link[len - 1] = '\0'; + } } } else { link = befs_ino->i_data.symlink; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index b14cebfd9047..9cc074019479 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -199,7 +199,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) printf("unlinking non-existent file %s:%lu (nlink=%d)\n", inode->i_sb->s_id, inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } de->ino = 0; mark_buffer_dirty_inode(bh, dir); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index a8e37f81d097..697af5bf70b3 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -78,7 +78,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); inode->i_uid = le32_to_cpu(di->i_uid); inode->i_gid = le32_to_cpu(di->i_gid); - inode->i_nlink = le32_to_cpu(di->i_nlink); + set_nlink(inode, le32_to_cpu(di->i_nlink)); inode->i_size = BFS_FILESIZE(di); inode->i_blocks = BFS_FILEBLOCKS(di); inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index dd0fdfc56d38..21ac5ee4b43f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -795,7 +795,16 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * might try to exec. This is because the brk will * follow the loader, and is not movable. */ #if defined(CONFIG_X86) || defined(CONFIG_ARM) - load_bias = 0; + /* Memory randomization might have been switched off + * in runtime via sysctl. + * If that is the case, retain the original non-zero + * load_bias value in order to establish proper + * non-randomized mappings. + */ + if (current->flags & PF_RANDOMIZE) + load_bias = 0; + else + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #else load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #endif diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index ba1a1ae4a18a..1e9edbdeda7e 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -521,7 +521,7 @@ static void kill_node(Node *e) write_unlock(&entries_lock); if (dentry) { - dentry->d_inode->i_nlink--; + drop_nlink(dentry->d_inode); d_drop(dentry); dput(dentry); simple_release_fs(&bm_mnt, &entry_count); diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 9c5e6b2cd11a..c2183f3917cd 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -22,6 +22,7 @@ #include <linux/blkdev.h> #include <linux/mempool.h> +#include <linux/export.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/slab.h> @@ -255,7 +255,6 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; - bio->bi_comp_cpu = -1; atomic_set(&bio->bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -338,7 +337,7 @@ static void bio_fs_destructor(struct bio *bio) * RETURNS: * Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) +struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); @@ -366,7 +365,7 @@ static void bio_kmalloc_destructor(struct bio *bio) * %__GFP_WAIT, the allocation is guaranteed to succeed. * **/ -struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) { struct bio *bio; @@ -697,7 +696,8 @@ static void bio_free_map_data(struct bio_map_data *bmd) kfree(bmd); } -static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, +static struct bio_map_data *bio_alloc_map_data(int nr_segs, + unsigned int iov_count, gfp_t gfp_mask) { struct bio_map_data *bmd; diff --git a/fs/block_dev.c b/fs/block_dev.c index 9fb0b15331d3..b07f1da1de4e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } - EXPORT_SYMBOL(I_BDEV); /* - * move the inode from it's current bdi to the a new bdi. if the inode is dirty - * we need to move it onto the dirty list of @dst so that the inode is always - * on the right list. + * Move the inode from its current bdi to a new bdi. If the inode is dirty we + * need to move it onto the dirty list of @dst so that the inode is always on + * the right list. */ static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - spin_lock(&inode_wb_list_lock); + struct backing_dev_info *old = inode->i_data.backing_dev_info; + + if (unlikely(dst == old)) /* deadlock avoidance */ + return; + bdi_lock_two(&old->wb, &dst->wb); spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) list_move(&inode->i_wb_list, &dst->wb.b_dirty); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&old->wb.list_lock); + spin_unlock(&dst->wb.list_lock); } static sector_t max_block(struct block_device *bdev) @@ -383,6 +387,10 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) struct inode *bd_inode = filp->f_mapping->host; struct block_device *bdev = I_BDEV(bd_inode); int error; + + error = filemap_write_and_wait_range(filp->f_mapping, start, end); + if (error) + return error; /* * There is no need to serialise calls to blkdev_issue_flush with @@ -548,6 +556,7 @@ struct block_device *bdget(dev_t dev) if (inode->i_state & I_NEW) { bdev->bd_contains = NULL; + bdev->bd_super = NULL; bdev->bd_inode = inode; bdev->bd_block_size = (1 << inode->i_blkbits); bdev->bd_part_count = 0; @@ -962,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) if (!bdev->bd_disk) return; - if (disk_partitionable(bdev->bd_disk)) + if (disk_part_scan_enabled(bdev->bd_disk)) bdev->bd_invalidated = 1; } @@ -1076,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk; + struct module *owner; int ret; int partno; int perm = 0; @@ -1101,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk = get_gendisk(bdev->bd_dev, &partno); if (!disk) goto out; + owner = disk->fops->owner; disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); @@ -1128,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); goto restart; } } @@ -1185,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_unlock_bdev; } /* only one opener holds refs to the module and disk */ - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); } bdev->bd_openers++; if (for_part) @@ -1206,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); out: bdput(bdev); @@ -1420,6 +1431,11 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); + /* ->release can cause the old bdi to disappear, + * so must switch it out first + */ + bdev_inode_switch_bdi(bdev->bd_inode, + &default_backing_dev_info); } if (bdev->bd_contains == bdev) { if (disk->fops->release) @@ -1428,16 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) if (!bdev->bd_openers) { struct module *owner = disk->fops->owner; - put_disk(disk); - module_put(owner); disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; - bdev_inode_switch_bdi(bdev->bd_inode, - &default_backing_dev_info); if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; + + put_disk(disk); + module_put(owner); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); @@ -1448,6 +1463,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) int blkdev_put(struct block_device *bdev, fmode_t mode) { + mutex_lock(&bdev->bd_mutex); + if (mode & FMODE_EXCL) { bool bdev_free; @@ -1456,7 +1473,6 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) * are protected with bdev_lock. bd_mutex is to * synchronize disk_holder unlinking. */ - mutex_lock(&bdev->bd_mutex); spin_lock(&bdev_lock); WARN_ON_ONCE(--bdev->bd_holders < 0); @@ -1474,17 +1490,21 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) * If this was the last claim, remove holder link and * unblock evpoll if it was a write holder. */ - if (bdev_free) { - if (bdev->bd_write_holder) { - disk_unblock_events(bdev->bd_disk); - disk_check_events(bdev->bd_disk); - bdev->bd_write_holder = false; - } + if (bdev_free && bdev->bd_write_holder) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; } - - mutex_unlock(&bdev->bd_mutex); } + /* + * Trigger event checking and tell drivers to flush MEDIA_CHANGE + * event. This is to ensure detection of media removal commanded + * from userland - e.g. eject(1). + */ + disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); + + mutex_unlock(&bdev->bd_mutex); + return __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9b72dcf1cd25..c0ddfd29c5e5 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,5 +6,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \ - compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o + export.o tree-log.o free-space-cache.o zlib.o lzo.o \ + compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ + reada.o backref.o + +btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 9f62ab2a7282..89b156d85d63 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -28,9 +28,7 @@ #include "btrfs_inode.h" #include "xattr.h" -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - -static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { int size; const char *name; @@ -61,22 +59,19 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) if (!value) return ERR_PTR(-ENOMEM); size = __btrfs_getxattr(inode, name, value, size); - if (size > 0) { - acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) { - kfree(value); - return acl; - } - set_cached_acl(inode, type, acl); - } - kfree(value); + } + if (size > 0) { + acl = posix_acl_from_xattr(value, size); } else if (size == -ENOENT || size == -ENODATA || size == 0) { /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; - set_cached_acl(inode, type, acl); } else { acl = ERR_PTR(-EIO); } + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); return acl; } @@ -111,7 +106,6 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, int ret, size = 0; const char *name; char *value = NULL; - mode_t mode; if (acl) { ret = posix_acl_valid(acl); @@ -122,13 +116,11 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, switch (type) { case ACL_TYPE_ACCESS: - mode = inode->i_mode; name = POSIX_ACL_XATTR_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &mode); + ret = posix_acl_equiv_mode(acl, &inode->i_mode); if (ret < 0) return ret; - inode->i_mode = mode; } ret = 0; break; @@ -195,27 +187,6 @@ out: return ret; } -int btrfs_check_acl(struct inode *inode, int mask) -{ - int error = -EAGAIN; - - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - error = -ECHILD; - } else { - struct posix_acl *acl; - acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - } - } - - return error; -} - /* * btrfs_init_acl is already generally called under fs_mutex, so the locking * stuff has been fixed to work with that. If the locking stuff changes, we @@ -243,31 +214,20 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, } if (IS_POSIXACL(dir) && acl) { - struct posix_acl *clone; - mode_t mode; - if (S_ISDIR(inode->i_mode)) { ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_DEFAULT); if (ret) goto failed; } - clone = posix_acl_clone(acl, GFP_NOFS); - ret = -ENOMEM; - if (!clone) - goto failed; - - mode = inode->i_mode; - ret = posix_acl_create_masq(clone, &mode); - if (ret >= 0) { - inode->i_mode = mode; - if (ret > 0) { - /* we need an acl */ - ret = btrfs_set_acl(trans, inode, clone, - ACL_TYPE_ACCESS); - } + ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + if (ret < 0) + return ret; + + if (ret > 0) { + /* we need an acl */ + ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); } - posix_acl_release(clone); } failed: posix_acl_release(acl); @@ -277,7 +237,7 @@ failed: int btrfs_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; int ret = 0; if (S_ISLNK(inode->i_mode)) @@ -290,17 +250,11 @@ int btrfs_acl_chmod(struct inode *inode) if (IS_ERR_OR_NULL(acl)) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); + ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (ret) + return ret; + ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); - if (!clone) - return -ENOMEM; - - ret = posix_acl_chmod_masq(clone, inode->i_mode); - if (!ret) - ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS); - - posix_acl_release(clone); - return ret; } @@ -317,18 +271,3 @@ const struct xattr_handler btrfs_xattr_acl_access_handler = { .get = btrfs_xattr_acl_get, .set = btrfs_xattr_acl_set, }; - -#else /* CONFIG_BTRFS_FS_POSIX_ACL */ - -int btrfs_acl_chmod(struct inode *inode) -{ - return 0; -} - -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) -{ - return 0; -} - -#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c new file mode 100644 index 000000000000..22c64fff1bd5 --- /dev/null +++ b/fs/btrfs/backref.c @@ -0,0 +1,776 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "backref.h" + +struct __data_ref { + struct list_head list; + u64 inum; + u64 root; + u64 extent_data_item_offset; +}; + +struct __shared_ref { + struct list_head list; + u64 disk_byte; +}; + +static int __inode_info(u64 inum, u64 ioff, u8 key_type, + struct btrfs_root *fs_root, struct btrfs_path *path, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + + key.type = key_type; + key.objectid = inum; + key.offset = ioff; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || found_key->objectid != key.objectid) + return 1; + + return 0; +} + +/* + * this makes the path point to (inum INODE_ITEM ioff) + */ +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct btrfs_key key; + return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, + &key); +} + +static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path, + struct btrfs_key *found_key) +{ + return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, + found_key); +} + +/* + * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements + * of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! + */ +static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + struct btrfs_inode_ref *iref, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) +{ + u32 len; + int slot; + u64 next_inum; + int ret; + s64 bytes_left = size - 1; + struct extent_buffer *eb = eb_in; + struct btrfs_key found_key; + + if (bytes_left >= 0) + dest[bytes_left] = '\0'; + + while (1) { + len = btrfs_inode_ref_name_len(eb, iref); + bytes_left -= len; + if (bytes_left >= 0) + read_extent_buffer(eb, dest + bytes_left, + (unsigned long)(iref + 1), len); + if (eb != eb_in) + free_extent_buffer(eb); + ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + if (ret) + break; + next_inum = found_key.offset; + + /* regular exit ahead */ + if (parent == next_inum) + break; + + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + if (eb != eb_in) + atomic_inc(&eb->refs); + btrfs_release_path(path); + + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + parent = next_inum; + --bytes_left; + if (bytes_left >= 0) + dest[bytes_left] = '/'; + } + + btrfs_release_path(path); + + if (ret) + return ERR_PTR(ret); + + return dest + bytes_left; +} + +/* + * this makes the path point to (logical EXTENT_ITEM *) + * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for + * tree blocks and <0 on error. + */ +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key) +{ + int ret; + u64 flags; + u32 item_size; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; + + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logical; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + ret = btrfs_previous_item(fs_info->extent_root, path, + 0, BTRFS_EXTENT_ITEM_KEY); + if (ret < 0) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); + if (found_key->type != BTRFS_EXTENT_ITEM_KEY || + found_key->objectid > logical || + found_key->objectid + found_key->offset <= logical) + return -ENOENT; + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + return BTRFS_EXTENT_FLAG_TREE_BLOCK; + if (flags & BTRFS_EXTENT_FLAG_DATA) + return BTRFS_EXTENT_FLAG_DATA; + + return -EIO; +} + +/* + * helper function to iterate extent inline refs. ptr must point to a 0 value + * for the first call and may be modified. it is used to track state. + * if more refs exist, 0 is returned and the next call to + * __get_extent_inline_ref must pass the modified ptr parameter to get the + * next ref. after the last ref was processed, 1 is returned. + * returns <0 on error + */ +static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + struct btrfs_extent_inline_ref **out_eiref, + int *out_type) +{ + unsigned long end; + u64 flags; + struct btrfs_tree_block_info *info; + + if (!*ptr) { + /* first call */ + flags = btrfs_extent_flags(eb, ei); + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_eiref = + (struct btrfs_extent_inline_ref *)(info + 1); + } else { + *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + *ptr = (unsigned long)*out_eiref; + if ((void *)*ptr >= (void *)ei + item_size) + return -ENOENT; + } + + end = (unsigned long)ei + item_size; + *out_eiref = (struct btrfs_extent_inline_ref *)*ptr; + *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); + + *ptr += btrfs_extent_inline_ref_size(*out_type); + WARN_ON(*ptr > end); + if (*ptr == end) + return 1; /* last */ + + return 0; +} + +/* + * reads the tree block backref for an extent. tree level and root are returned + * through out_level and out_root. ptr must point to a 0 value for the first + * call and may be modified (see __get_extent_inline_ref comment). + * returns 0 if data was provided, 1 if there was no more data to provide or + * <0 on error. + */ +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + u64 *out_root, u8 *out_level) +{ + int ret; + int type; + struct btrfs_tree_block_info *info; + struct btrfs_extent_inline_ref *eiref; + + if (*ptr == (unsigned long)-1) + return 1; + + while (1) { + ret = __get_extent_inline_ref(ptr, eb, ei, item_size, + &eiref, &type); + if (ret < 0) + return ret; + + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + break; + + if (ret == 1) + return 1; + } + + /* we can treat both ref types equally here */ + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_root = btrfs_extent_inline_ref_offset(eb, eiref); + *out_level = btrfs_tree_block_level(eb, info); + + if (ret == 1) + *ptr = (unsigned long)-1; + + return 0; +} + +static int __data_list_add(struct list_head *head, u64 inum, + u64 extent_data_item_offset, u64 root) +{ + struct __data_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->inum = inum; + ref->extent_data_item_offset = extent_data_item_offset; + ref->root = root; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, + struct btrfs_extent_data_ref *dref) +{ + return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref), + btrfs_extent_data_ref_offset(eb, dref), + btrfs_extent_data_ref_root(eb, dref)); +} + +static int __shared_list_add(struct list_head *head, u64 disk_byte) +{ + struct __shared_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->disk_byte = disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, + u64 logical, u64 inum, + u64 extent_data_item_offset, + u64 extent_offset, + struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) +{ + u64 ref_root; + u32 item_size; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *eiref; + struct __data_ref *ref; + int ret; + int type; + int last; + unsigned long ptr = 0; + + WARN_ON(!list_empty(data_refs)); + ret = extent_from_logical(fs_info, logical, path, &key); + if (ret & BTRFS_EXTENT_FLAG_DATA) + ret = -EIO; + if (ret < 0) + goto out; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + ret = 0; + ref_root = 0; + /* + * as done in iterate_extent_inodes, we first build a list of refs to + * iterate, then free the path and then iterate them to avoid deadlocks. + */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last < 0) { + ret = last; + goto out; + } + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) { + ref_root = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __data_list_add(data_refs, inum, + extent_data_item_offset, + ref_root); + } + } while (!ret && !last); + + btrfs_release_path(path); + + if (ref_root == 0) { + printk(KERN_ERR "btrfs: failed to find tree block ref " + "for shared data backref %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + +out: + while (!list_empty(data_refs)) { + ref = list_first_entry(data_refs, struct __data_ref, list); + list_del(&ref->list); + if (!ret) + ret = iterate(ref->inum, extent_offset + + ref->extent_data_item_offset, + ref->root, ctx); + kfree(ref); + } + + return ret; +} + +static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, + u64 logical, u64 orig_extent_item_objectid, + u64 extent_offset, struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) +{ + u64 disk_byte; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *eb; + int slot; + int nritems; + int ret; + int found = 0; + + eb = read_tree_block(fs_info->tree_root, logical, + fs_info->tree_root->leafsize, 0); + if (!eb) + return -EIO; + + /* + * from the shared data ref, we only have the leaf but we need + * the key. thus, we must look into all items and see that we + * find one (some) with a reference to our extent item. + */ + nritems = btrfs_header_nritems(eb); + for (slot = 0; slot < nritems; ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + if (!fi) { + free_extent_buffer(eb); + return -EIO; + } + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte != orig_extent_item_objectid) { + if (found) + break; + else + continue; + } + ++found; + ret = __iter_shared_inline_ref_inodes(fs_info, logical, + key.objectid, + key.offset, + extent_offset, path, + data_refs, + iterate, ctx); + if (ret) + break; + } + + if (!found) { + printk(KERN_ERR "btrfs: failed to follow shared data backref " + "to parent %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + + free_extent_buffer(eb); + return ret; +} + +/* + * calls iterate() for every inode that references the extent identified by + * the given parameters. will use the path given as a parameter and return it + * released. + * when the iterator function returns a non-zero value, iteration stops. + */ +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 extent_item_objectid, + u64 extent_offset, + iterate_extent_inodes_t *iterate, void *ctx) +{ + unsigned long ptr = 0; + int last; + int ret; + int type; + u64 logical; + u32 item_size; + struct btrfs_extent_inline_ref *eiref; + struct btrfs_extent_data_ref *dref; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; + struct list_head data_refs = LIST_HEAD_INIT(data_refs); + struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); + struct __data_ref *ref_d; + struct __shared_ref *ref_s; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + /* first we iterate the inline refs, ... */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last == -ENOENT) { + ret = 0; + break; + } + if (last < 0) { + ret = last; + break; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&eiref->offset); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + logical = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __shared_list_add(&shared_refs, logical); + } + } while (!ret && !last); + + /* ... then we proceed to in-tree references and ... */ + while (!ret) { + ++path->slots[0]; + if (path->slots[0] > btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_info->extent_root, path); + if (ret) { + if (ret == 1) + ret = 0; /* we're done */ + break; + } + eb = path->nodes[0]; + } + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_item_objectid) + break; + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ret = __shared_list_add(&shared_refs, key.offset); + } + } + + btrfs_release_path(path); + + /* + * ... only at the very end we can process the refs we found. this is + * because the iterator function we call is allowed to make tree lookups + * and we have to avoid deadlocks. additionally, we need more tree + * lookups ourselves for shared data refs. + */ + while (!list_empty(&data_refs)) { + ref_d = list_first_entry(&data_refs, struct __data_ref, list); + list_del(&ref_d->list); + if (!ret) + ret = iterate(ref_d->inum, extent_offset + + ref_d->extent_data_item_offset, + ref_d->root, ctx); + kfree(ref_d); + } + + while (!list_empty(&shared_refs)) { + ref_s = list_first_entry(&shared_refs, struct __shared_ref, + list); + list_del(&ref_s->list); + if (!ret) + ret = __iter_shared_inline_ref(fs_info, + ref_s->disk_byte, + extent_item_objectid, + extent_offset, path, + &data_refs, + iterate, ctx); + kfree(ref_s); + } + + return ret; +} + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx) +{ + int ret; + u64 offset; + struct btrfs_key found_key; + + ret = extent_from_logical(fs_info, logical, path, + &found_key); + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = -EINVAL; + if (ret < 0) + return ret; + + offset = logical - found_key.objectid; + ret = iterate_extent_inodes(fs_info, path, found_key.objectid, + offset, iterate, ctx); + + return ret; +} + +static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) +{ + int ret; + int slot; + u32 cur; + u32 len; + u32 name_len; + u64 parent = 0; + int found = 0; + struct extent_buffer *eb; + struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_key found_key; + + while (1) { + ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, + &found_key); + if (ret < 0) + break; + if (ret) { + ret = found ? 0 : -ENOENT; + break; + } + ++found; + + parent = found_key.offset; + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + atomic_inc(&eb->refs); + btrfs_release_path(path); + + item = btrfs_item_nr(eb, slot); + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { + name_len = btrfs_inode_ref_name_len(eb, iref); + /* path must be released before calling iterate()! */ + ret = iterate(parent, iref, eb, ctx); + if (ret) { + free_extent_buffer(eb); + break; + } + len = sizeof(*iref) + name_len; + iref = (struct btrfs_inode_ref *)((char *)iref + len); + } + free_extent_buffer(eb); + } + + btrfs_release_path(path); + + return ret; +} + +/* + * returns 0 if the path could be dumped (probably truncated) + * returns <0 in case of an error + */ +static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, + struct extent_buffer *eb, void *ctx) +{ + struct inode_fs_paths *ipath = ctx; + char *fspath; + char *fspath_min; + int i = ipath->fspath->elem_cnt; + const int s_ptr = sizeof(char *); + u32 bytes_left; + + bytes_left = ipath->fspath->bytes_left > s_ptr ? + ipath->fspath->bytes_left - s_ptr : 0; + + fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; + fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, + inum, fspath_min, bytes_left); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); + + if (fspath > fspath_min) { + ipath->fspath->val[i] = (u64)(unsigned long)fspath; + ++ipath->fspath->elem_cnt; + ipath->fspath->bytes_left = fspath - fspath_min; + } else { + ++ipath->fspath->elem_missed; + ipath->fspath->bytes_missing += fspath_min - fspath; + ipath->fspath->bytes_left = 0; + } + + return 0; +} + +/* + * this dumps all file system paths to the inode into the ipath struct, provided + * is has been created large enough. each path is zero-terminated and accessed + * from ipath->fspath->val[i]. + * when it returns, there are ipath->fspath->elem_cnt number of paths available + * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the + * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, + * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would + * have been needed to return all paths. + */ +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) +{ + return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, + inode_to_path, ipath); +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct btrfs_data_container *init_data_container(u32 total_bytes) +{ + struct btrfs_data_container *data; + size_t alloc_bytes; + + alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); + data = kmalloc(alloc_bytes, GFP_NOFS); + if (!data) + return ERR_PTR(-ENOMEM); + + if (total_bytes >= sizeof(*data)) { + data->bytes_left = total_bytes - sizeof(*data); + data->bytes_missing = 0; + } else { + data->bytes_missing = sizeof(*data) - total_bytes; + data->bytes_left = 0; + } + + data->elem_cnt = 0; + data->elem_missed = 0; + + return data; +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct inode_fs_paths *ifp; + struct btrfs_data_container *fspath; + + fspath = init_data_container(total_bytes); + if (IS_ERR(fspath)) + return (void *)fspath; + + ifp = kmalloc(sizeof(*ifp), GFP_NOFS); + if (!ifp) { + kfree(fspath); + return ERR_PTR(-ENOMEM); + } + + ifp->btrfs_path = path; + ifp->fspath = fspath; + ifp->fs_root = fs_root; + + return ifp; +} + +void free_ipath(struct inode_fs_paths *ipath) +{ + kfree(ipath); +} diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h new file mode 100644 index 000000000000..92618837cb8f --- /dev/null +++ b/fs/btrfs/backref.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_BACKREF__ +#define __BTRFS_BACKREF__ + +#include "ioctl.h" + +struct inode_fs_paths { + struct btrfs_path *btrfs_path; + struct btrfs_root *fs_root; + struct btrfs_data_container *fspath; +}; + +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, + void *ctx); +typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref, + struct extent_buffer *eb, void *ctx); + +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path); + +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key); + +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + u64 *out_root, u8 *out_level); + +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 extent_item_objectid, + u64 extent_offset, + iterate_extent_inodes_t *iterate, void *ctx); + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx); + +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); + +struct btrfs_data_container *init_data_container(u32 total_bytes); +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path); +void free_ipath(struct inode_fs_paths *ipath); + +#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 52d7eca8c7bf..634608d2a6d0 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -34,6 +34,9 @@ struct btrfs_inode { */ struct btrfs_key location; + /* Lock for counters */ + spinlock_t lock; + /* the extent_tree has caches of all the extent mappings to disk */ struct extent_map_tree extent_tree; @@ -100,11 +103,6 @@ struct btrfs_inode { */ u64 delalloc_bytes; - /* total number of bytes that may be used for this inode for - * delalloc - */ - u64 reserved_bytes; - /* * the size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk @@ -112,9 +110,6 @@ struct btrfs_inode { */ u64 disk_i_size; - /* flags field from the on disk inode */ - u32 flags; - /* * if this is a directory then index_cnt is the counter for the index * number for new files that are created @@ -129,13 +124,22 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * Number of bytes outstanding that are going to need csums. This is + * used in ENOSPC accounting. + */ + u64 csum_bytes; + + /* flags field from the on disk inode */ + u32 flags; + + /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent * items we think we'll end up using, and reserved_extents is the number * of extent items we've reserved metadata for. */ - atomic_t outstanding_extents; - atomic_t reserved_extents; + unsigned outstanding_extents; + unsigned reserved_extents; /* * ordered_data_close is set by truncate when a file that used @@ -143,14 +147,12 @@ struct btrfs_inode { * the btrfs file release call will add this inode to the * ordered operations list so that we make sure to flush out any * new data the application may have written before commit. - * - * yes, its silly to have a single bitflag, but we might grow more - * of these. */ unsigned ordered_data_close:1; unsigned orphan_meta_reserved:1; unsigned dummy_inode:1; unsigned in_defrag:1; + unsigned delalloc_meta_reserved:1; /* * always compress this one file @@ -173,7 +175,11 @@ static inline u64 btrfs_ino(struct inode *inode) { u64 ino = BTRFS_I(inode)->location.objectid; - if (ino <= BTRFS_FIRST_FREE_OBJECTID) + /* + * !ino: btree_inode + * type == BTRFS_ROOT_ITEM_KEY: subvol dir + */ + if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY) ino = inode->i_ino; return ino; } @@ -184,4 +190,13 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) BTRFS_I(inode)->disk_i_size = size; } +static inline bool btrfs_is_free_space_inode(struct btrfs_root *root, + struct inode *inode) +{ + if (root == root->fs_info->tree_root || + BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) + return true; + return false; +} + #endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index bfe42b03eaf9..14f1c5a0b2d2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -85,7 +85,8 @@ struct compressed_bio { static inline int compressed_bio_size(struct btrfs_root *root, unsigned long disk_size) { - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + return sizeof(struct compressed_bio) + ((disk_size + root->sectorsize - 1) / root->sectorsize) * csum_size; @@ -338,6 +339,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, u64 first_byte = disk_start; struct block_device *bdev; int ret; + int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); @@ -392,8 +394,11 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); - ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + if (!skip_sum) { + ret = btrfs_csum_one_bio(root, inode, bio, + start, 1); + BUG_ON(ret); + } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); BUG_ON(ret); @@ -418,8 +423,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); - ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + if (!skip_sum) { + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); BUG_ON(ret); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2e667868e0d2..dede441bdeee 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -54,8 +54,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) { int i; for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (p->nodes[i] && p->locks[i]) - btrfs_set_lock_blocking(p->nodes[i]); + if (!p->nodes[i] || !p->locks[i]) + continue; + btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]); + if (p->locks[i] == BTRFS_READ_LOCK) + p->locks[i] = BTRFS_READ_LOCK_BLOCKING; + else if (p->locks[i] == BTRFS_WRITE_LOCK) + p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; } } @@ -68,7 +73,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) * for held */ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held) + struct extent_buffer *held, int held_rw) { int i; @@ -79,19 +84,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, * really sure by forcing the path to blocking before we clear * the path blocking. */ - if (held) - btrfs_set_lock_blocking(held); + if (held) { + btrfs_set_lock_blocking_rw(held, held_rw); + if (held_rw == BTRFS_WRITE_LOCK) + held_rw = BTRFS_WRITE_LOCK_BLOCKING; + else if (held_rw == BTRFS_READ_LOCK) + held_rw = BTRFS_READ_LOCK_BLOCKING; + } btrfs_set_path_blocking(p); #endif for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { - if (p->nodes[i] && p->locks[i]) - btrfs_clear_lock_blocking(p->nodes[i]); + if (p->nodes[i] && p->locks[i]) { + btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]); + if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING) + p->locks[i] = BTRFS_WRITE_LOCK; + else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING) + p->locks[i] = BTRFS_READ_LOCK; + } } #ifdef CONFIG_DEBUG_LOCK_ALLOC if (held) - btrfs_clear_lock_blocking(held); + btrfs_clear_lock_blocking_rw(held, held_rw); #endif } @@ -119,7 +134,7 @@ noinline void btrfs_release_path(struct btrfs_path *p) if (!p->nodes[i]) continue; if (p->locks[i]) { - btrfs_tree_unlock(p->nodes[i]); + btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); p->locks[i] = 0; } free_extent_buffer(p->nodes[i]); @@ -167,6 +182,25 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) return eb; } +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_read_lock(eb); + if (eb == root->node) + break; + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + /* cowonly root (everything not a reference counted cow subvolume), just get * put onto a simple dirty list. transaction.c walks this to make sure they * get properly updated on disk. @@ -480,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + /* ensure we can see the force_cow */ + smp_rmb(); + + /* + * We do not need to cow a block if + * 1) this block is not created or changed in this transaction; + * 2) this block does not belong to TREE_RELOC tree; + * 3) the root is not forced COW. + * + * What is forced COW: + * when we create snapshot during commiting the transaction, + * after we've finished coping src root, we must COW the shared + * block to ensure the metadata consistency. + */ if (btrfs_header_generation(buf) == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && + !root->force_cow) return 0; return 1; } @@ -626,14 +675,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, for (i = start_slot; i < end_slot; i++) { int close = 1; - if (!parent->map_token) { - map_extent_buffer(parent, - btrfs_node_key_ptr_offset(i), - sizeof(struct btrfs_key_ptr), - &parent->map_token, &parent->kaddr, - &parent->map_start, &parent->map_len, - KM_USER1); - } btrfs_node_key(parent, &disk_key, i); if (!progress_passed && comp_keys(&disk_key, progress) < 0) continue; @@ -656,11 +697,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, last_block = blocknr; continue; } - if (parent->map_token) { - unmap_extent_buffer(parent, parent->map_token, - KM_USER1); - parent->map_token = NULL; - } cur = btrfs_find_tree_block(root, blocknr, blocksize); if (cur) @@ -701,11 +737,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, btrfs_tree_unlock(cur); free_extent_buffer(cur); } - if (parent->map_token) { - unmap_extent_buffer(parent, parent->map_token, - KM_USER1); - parent->map_token = NULL; - } return err; } @@ -746,7 +777,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, struct btrfs_disk_key *tmp = NULL; struct btrfs_disk_key unaligned; unsigned long offset; - char *map_token = NULL; char *kaddr = NULL; unsigned long map_start = 0; unsigned long map_len = 0; @@ -756,18 +786,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb, mid = (low + high) / 2; offset = p + mid * item_size; - if (!map_token || offset < map_start || + if (!kaddr || offset < map_start || (offset + sizeof(struct btrfs_disk_key)) > map_start + map_len) { - if (map_token) { - unmap_extent_buffer(eb, map_token, KM_USER0); - map_token = NULL; - } err = map_private_extent_buffer(eb, offset, sizeof(struct btrfs_disk_key), - &map_token, &kaddr, - &map_start, &map_len, KM_USER0); + &kaddr, &map_start, &map_len); if (!err) { tmp = (struct btrfs_disk_key *)(kaddr + offset - @@ -790,14 +815,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb, high = mid; else { *slot = mid; - if (map_token) - unmap_extent_buffer(eb, map_token, KM_USER0); return 0; } } *slot = low; - if (map_token) - unmap_extent_buffer(eb, map_token, KM_USER0); return 1; } @@ -890,14 +911,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, mid = path->nodes[level]; - WARN_ON(!path->locks[level]); + WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK && + path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING); WARN_ON(btrfs_header_generation(mid) != trans->transid); orig_ptr = btrfs_node_blockptr(mid, orig_slot); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } /* * deal with the case where there is only one pointer in the root @@ -1100,9 +1123,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, mid = path->nodes[level]; WARN_ON(btrfs_header_generation(mid) != trans->transid); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } if (!parent) return 1; @@ -1228,7 +1252,6 @@ static void reada_for_search(struct btrfs_root *root, u32 nr; u32 blocksize; u32 nscan = 0; - bool map = true; if (level != 1) return; @@ -1250,19 +1273,8 @@ static void reada_for_search(struct btrfs_root *root, nritems = btrfs_header_nritems(node); nr = slot; - if (node->map_token || path->skip_locking) - map = false; while (1) { - if (map && !node->map_token) { - unsigned long offset = btrfs_node_key_ptr_offset(nr); - map_private_extent_buffer(node, offset, - sizeof(struct btrfs_key_ptr), - &node->map_token, - &node->kaddr, - &node->map_start, - &node->map_len, KM_USER1); - } if (direction < 0) { if (nr == 0) break; @@ -1281,11 +1293,6 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - if (map && node->map_token) { - unmap_extent_buffer(node, node->map_token, - KM_USER1); - node->map_token = NULL; - } readahead_tree_block(root, search, blocksize, gen); nread += blocksize; } @@ -1293,10 +1300,6 @@ static void reada_for_search(struct btrfs_root *root, if ((nread > 65536 || nscan > 32)) break; } - if (map && node->map_token) { - unmap_extent_buffer(node, node->map_token, KM_USER1); - node->map_token = NULL; - } } /* @@ -1409,7 +1412,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level, t = path->nodes[i]; if (i >= lowest_unlock && i > skip_level && path->locks[i]) { - btrfs_tree_unlock(t); + btrfs_tree_unlock_rw(t, path->locks[i]); path->locks[i] = 0; } } @@ -1436,7 +1439,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) continue; if (!path->locks[i]) continue; - btrfs_tree_unlock(path->nodes[i]); + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); path->locks[i] = 0; } } @@ -1485,6 +1488,8 @@ read_block_for_search(struct btrfs_trans_handle *trans, * we can trust our generation number */ free_extent_buffer(tmp); + btrfs_set_path_blocking(p); + tmp = read_tree_block(root, blocknr, blocksize, gen); if (tmp && btrfs_buffer_uptodate(tmp, gen)) { *eb_ret = tmp; @@ -1540,20 +1545,27 @@ read_block_for_search(struct btrfs_trans_handle *trans, static int setup_nodes_for_search(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer *b, int level, int ins_len) + struct extent_buffer *b, int level, int ins_len, + int *write_lock_level) { int ret; if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { int sret; + if (*write_lock_level < level + 1) { + *write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + sret = reada_for_balance(root, p, level); if (sret) goto again; btrfs_set_path_blocking(p); sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); BUG_ON(sret > 0); if (sret) { @@ -1565,13 +1577,19 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { int sret; + if (*write_lock_level < level + 1) { + *write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + sret = reada_for_balance(root, p, level); if (sret) goto again; btrfs_set_path_blocking(p); sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); if (sret) { ret = sret; @@ -1615,27 +1633,78 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root int err; int level; int lowest_unlock = 1; + int root_lock; + /* everything at write_lock_level or lower must be write locked */ + int write_lock_level = 0; u8 lowest_level = 0; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); - if (ins_len < 0) + if (ins_len < 0) { lowest_unlock = 2; + /* when we are removing items, we might have to go up to level + * two as we update tree pointers Make sure we keep write + * for those levels as well + */ + write_lock_level = 2; + } else if (ins_len > 0) { + /* + * for inserting items, make sure we have a write lock on + * level 1 so we can update keys + */ + write_lock_level = 1; + } + + if (!cow) + write_lock_level = -1; + + if (cow && (p->keep_locks || p->lowest_level)) + write_lock_level = BTRFS_MAX_LEVEL; + again: + /* + * we try very hard to do read locks on the root + */ + root_lock = BTRFS_READ_LOCK; + level = 0; if (p->search_commit_root) { + /* + * the commit roots are read only + * so we always do read locks + */ b = root->commit_root; extent_buffer_get(b); + level = btrfs_header_level(b); if (!p->skip_locking) - btrfs_tree_lock(b); + btrfs_tree_read_lock(b); } else { - if (p->skip_locking) + if (p->skip_locking) { b = btrfs_root_node(root); - else - b = btrfs_lock_root_node(root); + level = btrfs_header_level(b); + } else { + /* we don't know the level of the root node + * until we actually have it read locked + */ + b = btrfs_read_lock_root_node(root); + level = btrfs_header_level(b); + if (level <= write_lock_level) { + /* whoops, must trade for write lock */ + btrfs_tree_read_unlock(b); + free_extent_buffer(b); + b = btrfs_lock_root_node(root); + root_lock = BTRFS_WRITE_LOCK; + + /* the level might have changed, check again */ + level = btrfs_header_level(b); + } + } } + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = root_lock; while (b) { level = btrfs_header_level(b); @@ -1644,10 +1713,6 @@ again: * setup the path here so we can release it under lock * contention with the cow code */ - p->nodes[level] = b; - if (!p->skip_locking) - p->locks[level] = 1; - if (cow) { /* * if we don't really need to cow this block @@ -1659,6 +1724,16 @@ again: btrfs_set_path_blocking(p); + /* + * must have write locks on this node and the + * parent + */ + if (level + 1 > write_lock_level) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], p->slots[level + 1], &b); @@ -1671,10 +1746,7 @@ cow_done: BUG_ON(!cow && ins_len); p->nodes[level] = b; - if (!p->skip_locking) - p->locks[level] = 1; - - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); /* * we have a lock on b and as long as we aren't changing @@ -1700,7 +1772,7 @@ cow_done: } p->slots[level] = slot; err = setup_nodes_for_search(trans, root, p, b, level, - ins_len); + ins_len, &write_lock_level); if (err == -EAGAIN) goto again; if (err) { @@ -1710,6 +1782,19 @@ cow_done: b = p->nodes[level]; slot = p->slots[level]; + /* + * slot 0 is special, if we change the key + * we have to update the parent pointer + * which means we must have a write lock + * on the parent + */ + if (slot == 0 && cow && + write_lock_level < level + 1) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + unlock_up(p, level, lowest_unlock); if (level == lowest_level) { @@ -1728,23 +1813,42 @@ cow_done: } if (!p->skip_locking) { - btrfs_clear_path_blocking(p, NULL); - err = btrfs_try_spin_lock(b); - - if (!err) { - btrfs_set_path_blocking(p); - btrfs_tree_lock(b); - btrfs_clear_path_blocking(p, b); + level = btrfs_header_level(b); + if (level <= write_lock_level) { + err = btrfs_try_tree_write_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_WRITE_LOCK); + } + p->locks[level] = BTRFS_WRITE_LOCK; + } else { + err = btrfs_try_tree_read_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_READ_LOCK); + } + p->locks[level] = BTRFS_READ_LOCK; } + p->nodes[level] = b; } } else { p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, b) < ins_len) { + if (write_lock_level < 1) { + write_lock_level = 1; + btrfs_release_path(p); + goto again; + } + btrfs_set_path_blocking(p); err = split_leaf(trans, root, key, p, ins_len, ret == 0); - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); BUG_ON(err > 0); if (err) { @@ -2025,7 +2129,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); extent_buffer_get(c); path->nodes[level] = c; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK; path->slots[level] = 0; return 0; } @@ -2253,14 +2357,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] == i) push_space += data_size; - if (!left->map_token) { - map_extent_buffer(left, (unsigned long)item, - sizeof(struct btrfs_item), - &left->map_token, &left->kaddr, - &left->map_start, &left->map_len, - KM_USER1); - } - this_item_size = btrfs_item_size(left, item); if (this_item_size + sizeof(*item) + push_space > free_space) break; @@ -2271,10 +2367,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, break; i--; } - if (left->map_token) { - unmap_extent_buffer(left, left->map_token, KM_USER1); - left->map_token = NULL; - } if (push_items == 0) goto out_unlock; @@ -2316,21 +2408,10 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } push_space -= btrfs_item_size(right, item); btrfs_set_item_offset(right, item, push_space); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } left_nritems -= push_items; btrfs_set_header_nritems(left, left_nritems); @@ -2467,13 +2548,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = 0; i < nr; i++) { item = btrfs_item_nr(right, i); - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } if (!empty && push_items > 0) { if (path->slots[0] < i) @@ -2496,11 +2570,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, push_space += this_item_size + sizeof(*item); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } - if (push_items == 0) { ret = 1; goto out; @@ -2530,23 +2599,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(left, i); - if (!left->map_token) { - map_extent_buffer(left, (unsigned long)item, - sizeof(struct btrfs_item), - &left->map_token, &left->kaddr, - &left->map_start, &left->map_len, - KM_USER1); - } ioff = btrfs_item_offset(left, item); btrfs_set_item_offset(left, item, ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); } btrfs_set_header_nritems(left, old_left_nritems + push_items); - if (left->map_token) { - unmap_extent_buffer(left, left->map_token, KM_USER1); - left->map_token = NULL; - } /* fixup right node */ if (push_items > right_nritems) { @@ -2574,21 +2632,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } - push_space = push_space - btrfs_item_size(right, item); btrfs_set_item_offset(right, item, push_space); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } btrfs_mark_buffer_dirty(left); if (right_nritems) @@ -2729,23 +2775,10 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, struct btrfs_item *item = btrfs_item_nr(right, i); u32 ioff; - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(right, item); btrfs_set_item_offset(right, item, ioff + rt_data_off); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } - btrfs_set_header_nritems(l, mid); ret = 0; btrfs_item_key(right, &disk_key, 0); @@ -3264,23 +3297,10 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff + size_diff); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - /* shift the data */ if (from_end) { memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + @@ -3377,22 +3397,10 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff - data_size); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - /* shift the data */ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + data_end - data_size, btrfs_leaf_data(leaf) + @@ -3494,27 +3502,13 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ - WARN_ON(leaf->map_token); for (i = slot; i < nritems; i++) { u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff - total_data); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), btrfs_item_nr_offset(slot), @@ -3608,27 +3602,13 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ - WARN_ON(leaf->map_token); for (i = slot; i < nritems; i++) { u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff - total_data); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), btrfs_item_nr_offset(slot), @@ -3840,22 +3820,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff + dsize); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), btrfs_item_nr_offset(slot + nr), sizeof(struct btrfs_item) * @@ -4004,11 +3972,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, WARN_ON(!path->keep_locks); again: - cur = btrfs_lock_root_node(root); + cur = btrfs_read_lock_root_node(root); level = btrfs_header_level(cur); WARN_ON(path->nodes[level]); path->nodes[level] = cur; - path->locks[level] = 1; + path->locks[level] = BTRFS_READ_LOCK; if (btrfs_header_generation(cur) < min_trans) { ret = 1; @@ -4098,12 +4066,12 @@ find_next_key: cur = read_node_slot(root, cur, slot); BUG_ON(!cur); - btrfs_tree_lock(cur); + btrfs_tree_read_lock(cur); - path->locks[level - 1] = 1; + path->locks[level - 1] = BTRFS_READ_LOCK; path->nodes[level - 1] = cur; unlock_up(path, level, 1); - btrfs_clear_path_blocking(path, NULL); + btrfs_clear_path_blocking(path, NULL, 0); } out: if (ret == 0) @@ -4218,30 +4186,21 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) u32 nritems; int ret; int old_spinning = path->leave_spinning; - int force_blocking = 0; + int next_rw_lock = 0; nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) return 1; - /* - * we take the blocks in an order that upsets lockdep. Using - * blocking mode is the only way around it. - */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - force_blocking = 1; -#endif - btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); again: level = 1; next = NULL; + next_rw_lock = 0; btrfs_release_path(path); path->keep_locks = 1; - - if (!force_blocking) - path->leave_spinning = 1; + path->leave_spinning = 1; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->keep_locks = 0; @@ -4281,11 +4240,12 @@ again: } if (next) { - btrfs_tree_unlock(next); + btrfs_tree_unlock_rw(next, next_rw_lock); free_extent_buffer(next); } next = c; + next_rw_lock = path->locks[level]; ret = read_block_for_search(NULL, root, path, &next, level, slot, &key); if (ret == -EAGAIN) @@ -4297,15 +4257,14 @@ again: } if (!path->skip_locking) { - ret = btrfs_try_spin_lock(next); + ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_lock(next); - if (!force_blocking) - btrfs_clear_path_blocking(path, next); + btrfs_tree_read_lock(next); + btrfs_clear_path_blocking(path, next, + BTRFS_READ_LOCK); } - if (force_blocking) - btrfs_set_lock_blocking(next); + next_rw_lock = BTRFS_READ_LOCK; } break; } @@ -4314,14 +4273,13 @@ again: level--; c = path->nodes[level]; if (path->locks[level]) - btrfs_tree_unlock(c); + btrfs_tree_unlock_rw(c, path->locks[level]); free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!path->skip_locking) - path->locks[level] = 1; - + path->locks[level] = next_rw_lock; if (!level) break; @@ -4336,16 +4294,14 @@ again: } if (!path->skip_locking) { - btrfs_assert_tree_locked(path->nodes[level]); - ret = btrfs_try_spin_lock(next); + ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_lock(next); - if (!force_blocking) - btrfs_clear_path_blocking(path, next); + btrfs_tree_read_lock(next); + btrfs_clear_path_blocking(path, next, + BTRFS_READ_LOCK); } - if (force_blocking) - btrfs_set_lock_blocking(next); + next_rw_lock = BTRFS_READ_LOCK; } } ret = 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 82be74efbb26..50634abef9b4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -30,6 +30,7 @@ #include <linux/kobject.h> #include <trace/events/btrfs.h> #include <asm/kmap_types.h> +#include <linux/pagemap.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -360,6 +361,47 @@ struct btrfs_header { #define BTRFS_LABEL_SIZE 256 /* + * just in case we somehow lose the roots and are not able to mount, + * we store an array of the roots from previous transactions + * in the super. + */ +#define BTRFS_NUM_BACKUP_ROOTS 4 +struct btrfs_root_backup { + __le64 tree_root; + __le64 tree_root_gen; + + __le64 chunk_root; + __le64 chunk_root_gen; + + __le64 extent_root; + __le64 extent_root_gen; + + __le64 fs_root; + __le64 fs_root_gen; + + __le64 dev_root; + __le64 dev_root_gen; + + __le64 csum_root; + __le64 csum_root_gen; + + __le64 total_bytes; + __le64 bytes_used; + __le64 num_devices; + /* future */ + __le64 unsed_64[4]; + + u8 tree_root_level; + u8 chunk_root_level; + u8 extent_root_level; + u8 fs_root_level; + u8 dev_root_level; + u8 csum_root_level; + /* future and to align */ + u8 unused_8[10]; +} __attribute__ ((__packed__)); + +/* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc */ @@ -405,6 +447,7 @@ struct btrfs_super_block { /* future expansion */ __le64 reserved[31]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; + struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); /* @@ -755,6 +798,8 @@ struct btrfs_space_info { chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + unsigned int flush:1; /* set if we are trying to make space */ + unsigned int force_alloc; /* set if we need to force a chunk alloc for this space */ @@ -764,20 +809,14 @@ struct btrfs_space_info { struct list_head block_groups[BTRFS_NR_RAID_TYPES]; spinlock_t lock; struct rw_semaphore groups_sem; - atomic_t caching_threads; + wait_queue_head_t wait; }; struct btrfs_block_rsv { u64 size; u64 reserved; - u64 freed[2]; struct btrfs_space_info *space_info; - struct list_head list; spinlock_t lock; - atomic_t usage; - unsigned int priority:8; - unsigned int durable:1; - unsigned int refill_used:1; unsigned int full:1; }; @@ -809,7 +848,8 @@ struct btrfs_free_cluster { enum btrfs_caching_type { BTRFS_CACHE_NO = 0, BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FINISHED = 2, + BTRFS_CACHE_FAST = 2, + BTRFS_CACHE_FINISHED = 3, }; enum btrfs_disk_cache_state { @@ -824,6 +864,7 @@ struct btrfs_caching_control { struct list_head list; struct mutex mutex; wait_queue_head_t wait; + struct btrfs_work work; struct btrfs_block_group_cache *block_group; u64 progress; atomic_t count; @@ -837,10 +878,10 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; - u64 reserved_pinned; u64 bytes_super; u64 flags; u64 sectorsize; + u64 cache_generation; unsigned int ro:1; unsigned int dirty:1; unsigned int iref:1; @@ -896,6 +937,10 @@ struct btrfs_fs_info { spinlock_t block_group_cache_lock; struct rb_root block_group_cache_tree; + /* keep track of unallocated space */ + spinlock_t free_chunk_lock; + u64 free_chunk_space; + struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; @@ -913,14 +958,11 @@ struct btrfs_fs_info { struct btrfs_block_rsv trans_block_rsv; /* block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; + /* block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; struct btrfs_block_rsv empty_block_rsv; - /* list of block reservations that cross multiple transactions */ - struct list_head durable_block_rsv_list; - - struct mutex durable_block_rsv_mutex; - u64 generation; u64 last_trans_committed; @@ -939,8 +981,8 @@ struct btrfs_fs_info { wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; - struct btrfs_super_block super_copy; - struct btrfs_super_block super_for_commit; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; struct block_device *__bdev; struct super_block *sb; struct inode *btree_inode; @@ -1032,6 +1074,9 @@ struct btrfs_fs_info { struct btrfs_workers endio_write_workers; struct btrfs_workers endio_freespace_worker; struct btrfs_workers submit_workers; + struct btrfs_workers caching_workers; + struct btrfs_workers readahead_workers; + /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens @@ -1114,6 +1159,13 @@ struct btrfs_fs_info { u64 fs_state; struct btrfs_delayed_root *delayed_root; + + /* readahead tree */ + spinlock_t reada_lock; + struct radix_tree_root reada_tree; + + /* next backup root to be overwritten */ + int backup_root_index; }; /* @@ -1220,6 +1272,8 @@ struct btrfs_root { * for stat. It may be used for more later */ dev_t anon_dev; + + int force_cow; }; struct btrfs_ioctl_defrag_range_args { @@ -1358,6 +1412,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) +#define BTRFS_MOUNT_RECOVERY (1 << 18) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1410,17 +1465,15 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(struct extent_buffer *eb) \ { \ - type *p = kmap_atomic(eb->first_page, KM_USER0); \ + type *p = page_address(eb->first_page); \ u##bits res = le##bits##_to_cpu(p->member); \ - kunmap_atomic(p, KM_USER0); \ return res; \ } \ static inline void btrfs_set_##name(struct extent_buffer *eb, \ u##bits val) \ { \ - type *p = kmap_atomic(eb->first_page, KM_USER0); \ + type *p = page_address(eb->first_page); \ p->member = cpu_to_le##bits(val); \ - kunmap_atomic(p, KM_USER0); \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ @@ -1975,6 +2028,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root) return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; } +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); @@ -2126,14 +2228,30 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); } +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ + return mapping_gfp_mask(mapping) & ~__GFP_FS; +} + /* extent-tree.c */ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, - int num_items) + unsigned num_items) { return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3 * num_items; } +/* + * Doing a truncate won't result in new nodes or leaves, just what we need for + * COW. + */ +static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, + unsigned num_items) +{ + return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + num_items; +} + void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); @@ -2143,6 +2261,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 num_bytes, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); @@ -2193,8 +2314,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo); +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len); int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, @@ -2222,9 +2343,6 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); -int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - int num_items); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, @@ -2240,25 +2358,26 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_block_rsv_check(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor); + u64 min_reserved); +int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *rsv); int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); int btrfs_set_block_group_rw(struct btrfs_root *root, @@ -2330,7 +2449,7 @@ struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held); + struct extent_buffer *held, int held_rw); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2365,8 +2484,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -int btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref); +void btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -2379,6 +2498,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) smp_mb(); return fs_info->closing; } +static inline void free_fs_info(struct btrfs_fs_info *fs_info) +{ + kfree(fs_info->delayed_root); + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kfree(fs_info); +} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, @@ -2404,8 +2535,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key); int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); int btrfs_find_orphan_roots(struct btrfs_root *tree_root); -int btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node); +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); void btrfs_check_and_init_root_item(struct btrfs_root_item *item); /* dir-item.c */ @@ -2521,6 +2652,14 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag #define PageChecked PageFsMisc #endif +/* This forces readahead on a given range of bytes in an inode */ +static inline void btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, unsigned long req_size) +{ + page_cache_sync_readahead(mapping, ra, file, offset, req_size); +} + struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -2549,9 +2688,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); -unsigned long btrfs_force_ra(struct address_space *mapping, - struct file_ra_state *ra, struct file *file, - pgoff_t offset, pgoff_t last_index); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); @@ -2574,11 +2710,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve); -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); @@ -2645,13 +2776,22 @@ do { \ /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL -int btrfs_check_acl(struct inode *inode, int mask); -#else -#define btrfs_check_acl NULL -#endif +struct posix_acl *btrfs_get_acl(struct inode *inode, int type); int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); +#else +#define btrfs_get_acl NULL +static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) +{ + return 0; +} +static inline int btrfs_acl_chmod(struct inode *inode) +{ + return 0; +} +#endif /* relocation.c */ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); @@ -2683,4 +2823,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); +/* reada.c */ +struct reada_control { + struct btrfs_root *root; /* tree to prefetch */ + struct btrfs_key key_start; + struct btrfs_key key_end; /* exclusive */ + atomic_t elems; + struct kref refcnt; + wait_queue_head_t wait; +}; +struct reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *start, struct btrfs_key *end); +int btrfs_reada_wait(void *handle); +void btrfs_reada_detach(void *handle); +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err); + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 98c68e658a9b..5b163572e0ca 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, return 0; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); @@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, if (!item->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -617,24 +617,102 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct inode *inode, struct btrfs_delayed_node *node) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - - if (!trans->bytes_reserved) - return 0; + int release = false; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); + + /* + * btrfs_dirty_inode will update the inode under btrfs_join_transaction + * which doesn't reserve space for speed. This is a problem since we + * still need to reserve space for this update, so try to reserve the + * space. + * + * Now if src_rsv == delalloc_block_rsv we'll let it just steal since + * we're accounted for. + */ + if (!trans->bytes_reserved && + src_rsv != &root->fs_info->delalloc_block_rsv) { + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + /* + * Since we're under a transaction reserve_metadata_bytes could + * try to commit the transaction which will make it return + * EAGAIN to make us stop the transaction we have, so return + * ENOSPC instead so that btrfs_dirty_inode knows what to do. + */ + if (ret == -EAGAIN) + ret = -ENOSPC; + if (!ret) + node->bytes_reserved = num_bytes; + return ret; + } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { + spin_lock(&BTRFS_I(inode)->lock); + if (BTRFS_I(inode)->delalloc_meta_reserved) { + BTRFS_I(inode)->delalloc_meta_reserved = 0; + spin_unlock(&BTRFS_I(inode)->lock); + release = true; + goto migrate; + } + spin_unlock(&BTRFS_I(inode)->lock); + + /* Ok we didn't have space pre-reserved. This shouldn't happen + * too often but it can happen if we do delalloc to an existing + * inode which gets dirtied because of the time update, and then + * isn't touched again until after the transaction commits and + * then we try to write out the data. First try to be nice and + * reserve something strictly for us. If not be a pain and try + * to steal from the delalloc block rsv. + */ + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + if (!ret) + goto out; + + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) + goto out; + + /* + * Ok this is a problem, let's just steal from the global rsv + * since this really shouldn't happen that often. + */ + WARN_ON(1); + ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, + dst_rsv, num_bytes); + goto out; + } + +migrate: ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + +out: + /* + * Migrate only takes a reservation, it doesn't touch the size of the + * block_rsv. This is to simplify people who don't normally have things + * migrated from their block rsv. If they go to release their + * reservation, that will decrease the size as well, so if migrate + * reduced size we'd end up with a negative size. But for the + * delalloc_meta_reserved stuff we will only know to drop 1 reservation, + * but we could in fact do this reserve/migrate dance several times + * between the time we did the original reservation and we'd clean it + * up. So to take care of this, release the space for the meta + * reservation here. I think it may be time for a documentation page on + * how block rsvs. work. + */ if (!ret) node->bytes_reserved = num_bytes; + if (release) + btrfs_block_rsv_release(root, src_rsv, num_bytes); + return ret; } @@ -646,7 +724,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, if (!node->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, node->bytes_reserved); node->bytes_reserved = 0; @@ -735,7 +813,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, } /* reset all the locked nodes in the patch to spinning locks. */ - btrfs_clear_path_blocking(path, NULL); + btrfs_clear_path_blocking(path, NULL, 0); /* insert the keys of the items */ ret = setup_items_for_insert(trans, root, path, keys, data_size, @@ -1026,7 +1104,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; delayed_root = btrfs_get_delayed_root(root); @@ -1069,7 +1147,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &node->root->fs_info->global_block_rsv; + trans->block_rsv = &node->root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, node->root, node); if (!ret) @@ -1149,7 +1227,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) goto free_path; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); if (!ret) @@ -1641,7 +1719,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode->i_gid = btrfs_stack_inode_gid(inode_item); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); inode->i_mode = btrfs_stack_inode_mode(inode_item); - inode->i_nlink = btrfs_stack_inode_nlink(inode_item); + set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); @@ -1685,12 +1763,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); - /* - * we must reserve enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); + ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, + delayed_node); + if (ret) + goto release_node; fill_stack_inode_item(trans, &delayed_node->inode_item, inode); delayed_node->inode_dirty = 1; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 8d27af4bd8b9..7083d08b2a21 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -25,7 +25,7 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/wait.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "ctree.h" diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 685f2593c4f0..31d84e78129b 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -89,13 +89,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, data_size = sizeof(*dir_item) + name_len + data_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); - /* - * FIXME: at some point we should handle xattr's that are larger than - * what we can fit in our leaf. We set location to NULL b/c we arent - * pointing at anything else, that will change if we store the xattr - * data in a separate inode. - */ - BUG_ON(IS_ERR(dir_item)); + if (IS_ERR(dir_item)) + return PTR_ERR(dir_item); memset(&location, 0, sizeof(location)); leaf = path->nodes[0]; @@ -203,8 +198,6 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_key key; int ins_len = mod < 0 ? -1 : 0; int cow = mod != 0; - struct btrfs_key found_key; - struct extent_buffer *leaf; key.objectid = dir; btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); @@ -214,18 +207,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) return ERR_PTR(ret); - if (ret > 0) { - if (path->slots[0] == 0) - return NULL; - path->slots[0]--; - } - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - if (found_key.objectid != dir || - btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY || - found_key.offset != key.offset) + if (ret > 0) return NULL; return btrfs_match_dir_item_name(root, path, name, name_len); @@ -320,8 +302,6 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_key key; int ins_len = mod < 0 ? -1 : 0; int cow = mod != 0; - struct btrfs_key found_key; - struct extent_buffer *leaf; key.objectid = dir; btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); @@ -329,18 +309,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) return ERR_PTR(ret); - if (ret > 0) { - if (path->slots[0] == 0) - return NULL; - path->slots[0]--; - } - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - if (found_key.objectid != dir || - btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY || - found_key.offset != key.offset) + if (ret > 0) return NULL; return btrfs_match_dir_item_name(root, path, name, name_len); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b231ae13b269..632f8f3cc9db 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -100,38 +100,83 @@ struct async_submit_bio { struct btrfs_work work; }; -/* These are used to set the lockdep class on the extent buffer locks. - * The class is set by the readpage_end_io_hook after the buffer has - * passed csum validation but before the pages are unlocked. +/* + * Lockdep class keys for extent_buffer->lock's in this root. For a given + * eb, the lockdep key is determined by the btrfs_root it belongs to and + * the level the eb occupies in the tree. + * + * Different roots are used for different purposes and may nest inside each + * other and they require separate keysets. As lockdep keys should be + * static, assign keysets according to the purpose of the root as indicated + * by btrfs_root->objectid. This ensures that all special purpose roots + * have separate keysets. * - * The lockdep class is also set by btrfs_init_new_buffer on freshly - * allocated blocks. + * Lock-nesting across peer nodes is always done with the immediate parent + * node locked thus preventing deadlock. As lockdep doesn't know this, use + * subclass to avoid triggering lockdep warning in such cases. * - * The class is based on the level in the tree block, which allows lockdep - * to know that lower nodes nest inside the locks of higher nodes. + * The key is set by the readpage_end_io_hook after the buffer has passed + * csum validation but before the pages are unlocked. It is also set by + * btrfs_init_new_buffer on freshly allocated blocks. * - * We also add a check to make sure the highest level of the tree is - * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this - * code needs update as well. + * We also add a check to make sure the highest level of the tree is the + * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code + * needs update as well. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC # if BTRFS_MAX_LEVEL != 8 # error # endif -static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; -static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { - /* leaf */ - "btrfs-extent-00", - "btrfs-extent-01", - "btrfs-extent-02", - "btrfs-extent-03", - "btrfs-extent-04", - "btrfs-extent-05", - "btrfs-extent-06", - "btrfs-extent-07", - /* highest possible level */ - "btrfs-extent-08", + +static struct btrfs_lockdep_keyset { + u64 id; /* root objectid */ + const char *name_stem; /* lock name stem */ + char names[BTRFS_MAX_LEVEL + 1][20]; + struct lock_class_key keys[BTRFS_MAX_LEVEL + 1]; +} btrfs_lockdep_keysets[] = { + { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" }, + { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" }, + { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" }, + { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, + { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, + { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, + { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" }, + { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, + { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, + { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, + { .id = 0, .name_stem = "tree" }, }; + +void __init btrfs_init_lockdep(void) +{ + int i, j; + + /* initialize lockdep class names */ + for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) { + struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i]; + + for (j = 0; j < ARRAY_SIZE(ks->names); j++) + snprintf(ks->names[j], sizeof(ks->names[j]), + "btrfs-%s-%02d", ks->name_stem, j); + } +} + +void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, + int level) +{ + struct btrfs_lockdep_keyset *ks; + + BUG_ON(level >= ARRAY_SIZE(ks->keys)); + + /* find the matching keyset, id 0 is the default entry */ + for (ks = btrfs_lockdep_keysets; ks->id; ks++) + if (ks->id == objectid) + break; + + lockdep_set_class_and_name(&eb->lock, + &ks->keys[level], ks->names[level]); +} + #endif /* @@ -211,13 +256,11 @@ void btrfs_csum_final(u32 crc, char *result) static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, int verify) { - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); char *result = NULL; unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; - char *map_token = NULL; char *kaddr; unsigned long map_start; unsigned long map_len; @@ -228,8 +271,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, len = buf->len - offset; while (len > 0) { err = map_private_extent_buffer(buf, offset, 32, - &map_token, &kaddr, - &map_start, &map_len, KM_USER0); + &kaddr, &map_start, &map_len); if (err) return 1; cur_len = min(len, map_len - (offset - map_start)); @@ -237,7 +279,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, crc, cur_len); len -= cur_len; offset += cur_len; - unmap_extent_buffer(buf, map_token, KM_USER0); } if (csum_size > sizeof(inline_result)) { result = kzalloc(csum_size * sizeof(char), GFP_NOFS); @@ -325,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; while (1) { - ret = read_extent_buffer_pages(io_tree, eb, start, 1, + ret = read_extent_buffer_pages(io_tree, eb, start, + WAIT_COMPLETE, btree_get_extent, mirror_num); if (!ret && !verify_parent_transid(io_tree, eb, parent_transid)) @@ -494,15 +536,6 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) -{ - lockdep_set_class_and_name(&eb->lock, - &btrfs_eb_class[level], - btrfs_eb_name[level]); -} -#endif - static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { @@ -553,7 +586,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, } found_level = btrfs_header_level(eb); - btrfs_set_buffer_lockdep_class(eb, found_level); + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), + eb, found_level); ret = csum_tree_block(root, eb, 1); if (ret) { @@ -574,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, end = min_t(u64, eb->len, PAGE_CACHE_SIZE); end = eb->start + end - 1; err: + if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + btree_readahead_hook(root, eb, eb->start, ret); + } + free_extent_buffer(eb); out: return ret; } +static int btree_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + int mirror_num, struct extent_state *state) +{ + struct extent_io_tree *tree; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) + goto out; + + if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + btree_readahead_hook(root, eb, eb->start, -EIO); + } + free_extent_buffer(eb); + +out: + return -EIO; /* we fixed nothing */ +} + static void end_workqueue_bio(struct bio *bio, int err) { struct end_io_wq *end_io_wq = bio->bi_private; @@ -874,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btree_get_extent); + return extent_read_full_page(tree, page, btree_get_extent, 0); } static int btree_releasepage(struct page *page, gfp_t gfp_flags) @@ -940,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, if (!buf) return 0; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, 0, 0, btree_get_extent, 0); + buf, 0, WAIT_NONE, btree_get_extent, 0); free_extent_buffer(buf); return ret; } +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; + int ret; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + + set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); + + ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK, + btree_get_extent, mirror_num); + if (ret) { + free_extent_buffer(buf); + return ret; + } + + if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { + free_extent_buffer(buf); + return -EIO; + } else if (extent_buffer_uptodate(io_tree, buf, NULL)) { + *eb = buf; + } else { + free_extent_buffer(buf); + } + return 0; +} + struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { @@ -1101,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->commit_root = NULL; root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { free_extent_buffer(root->node); + root->node = NULL; return -EIO; } root->commit_root = btrfs_root_node(root); @@ -1543,6 +1648,235 @@ sleep: return 0; } +/* + * this will find the highest generation in the array of + * root backups. The index of the highest array is returned, + * or -1 if we can't find anything. + * + * We check to make sure the array is valid by comparing the + * generation of the latest root in the array with the generation + * in the super block. If they don't match we pitch it. + */ +static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) +{ + u64 cur; + int newest_index = -1; + struct btrfs_root_backup *root_backup; + int i; + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { + root_backup = info->super_copy->super_roots + i; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = i; + } + + /* check to see if we actually wrapped around */ + if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { + root_backup = info->super_copy->super_roots; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = 0; + } + return newest_index; +} + + +/* + * find the oldest backup so we know where to store new entries + * in the backup array. This will set the backup_root_index + * field in the fs_info struct + */ +static void find_oldest_super_backup(struct btrfs_fs_info *info, + u64 newest_gen) +{ + int newest_index = -1; + + newest_index = find_newest_super_backup(info, newest_gen); + /* if there was garbage in there, just move along */ + if (newest_index == -1) { + info->backup_root_index = 0; + } else { + info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; + } +} + +/* + * copy all the root pointers into the super backup array. + * this will bump the backup pointer by one when it is + * done + */ +static void backup_super_roots(struct btrfs_fs_info *info) +{ + int next_backup; + struct btrfs_root_backup *root_backup; + int last_backup; + + next_backup = info->backup_root_index; + last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + + /* + * just overwrite the last backup if we're at the same generation + * this happens only at umount + */ + root_backup = info->super_for_commit->super_roots + last_backup; + if (btrfs_backup_tree_root_gen(root_backup) == + btrfs_header_generation(info->tree_root->node)) + next_backup = last_backup; + + root_backup = info->super_for_commit->super_roots + next_backup; + + /* + * make sure all of our padding and empty slots get zero filled + * regardless of which ones we use today + */ + memset(root_backup, 0, sizeof(*root_backup)); + + info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; + + btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); + btrfs_set_backup_tree_root_gen(root_backup, + btrfs_header_generation(info->tree_root->node)); + + btrfs_set_backup_tree_root_level(root_backup, + btrfs_header_level(info->tree_root->node)); + + btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); + btrfs_set_backup_chunk_root_gen(root_backup, + btrfs_header_generation(info->chunk_root->node)); + btrfs_set_backup_chunk_root_level(root_backup, + btrfs_header_level(info->chunk_root->node)); + + btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(info->extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(info->extent_root->node)); + + /* + * we might commit during log recovery, which happens before we set + * the fs_root. Make sure it is valid before we fill it in. + */ + if (info->fs_root && info->fs_root->node) { + btrfs_set_backup_fs_root(root_backup, + info->fs_root->node->start); + btrfs_set_backup_fs_root_gen(root_backup, + btrfs_header_generation(info->fs_root->node)); + btrfs_set_backup_fs_root_level(root_backup, + btrfs_header_level(info->fs_root->node)); + } + + btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); + btrfs_set_backup_dev_root_gen(root_backup, + btrfs_header_generation(info->dev_root->node)); + btrfs_set_backup_dev_root_level(root_backup, + btrfs_header_level(info->dev_root->node)); + + btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(info->csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(info->csum_root->node)); + + btrfs_set_backup_total_bytes(root_backup, + btrfs_super_total_bytes(info->super_copy)); + btrfs_set_backup_bytes_used(root_backup, + btrfs_super_bytes_used(info->super_copy)); + btrfs_set_backup_num_devices(root_backup, + btrfs_super_num_devices(info->super_copy)); + + /* + * if we don't copy this out to the super_copy, it won't get remembered + * for the next commit + */ + memcpy(&info->super_copy->super_roots, + &info->super_for_commit->super_roots, + sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); +} + +/* + * this copies info out of the root backup array and back into + * the in-memory super block. It is meant to help iterate through + * the array, so you send it the number of backups you've already + * tried and the last backup index you used. + * + * this returns -1 when it has tried all the backups + */ +static noinline int next_root_backup(struct btrfs_fs_info *info, + struct btrfs_super_block *super, + int *num_backups_tried, int *backup_index) +{ + struct btrfs_root_backup *root_backup; + int newest = *backup_index; + + if (*num_backups_tried == 0) { + u64 gen = btrfs_super_generation(super); + + newest = find_newest_super_backup(info, gen); + if (newest == -1) + return -1; + + *backup_index = newest; + *num_backups_tried = 1; + } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { + /* we've tried all the backups, all done */ + return -1; + } else { + /* jump to the next oldest backup */ + newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + *backup_index = newest; + *num_backups_tried += 1; + } + root_backup = super->super_roots + newest; + + btrfs_set_super_generation(super, + btrfs_backup_tree_root_gen(root_backup)); + btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); + btrfs_set_super_root_level(super, + btrfs_backup_tree_root_level(root_backup)); + btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); + + /* + * fixme: the total bytes and num_devices need to match or we should + * need a fsck + */ + btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); + btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); + return 0; +} + +/* helper to cleanup tree roots */ +static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) +{ + free_extent_buffer(info->tree_root->node); + free_extent_buffer(info->tree_root->commit_root); + free_extent_buffer(info->dev_root->node); + free_extent_buffer(info->dev_root->commit_root); + free_extent_buffer(info->extent_root->node); + free_extent_buffer(info->extent_root->commit_root); + free_extent_buffer(info->csum_root->node); + free_extent_buffer(info->csum_root->commit_root); + + info->tree_root->node = NULL; + info->tree_root->commit_root = NULL; + info->dev_root->node = NULL; + info->dev_root->commit_root = NULL; + info->extent_root->node = NULL; + info->extent_root->commit_root = NULL; + info->csum_root->node = NULL; + info->csum_root->commit_root = NULL; + + if (chunk_root) { + free_extent_buffer(info->chunk_root->node); + free_extent_buffer(info->chunk_root->commit_root); + info->chunk_root->node = NULL; + info->chunk_root->commit_root = NULL; + } +} + + struct btrfs_root *open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) @@ -1556,29 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, u64 features; struct btrfs_key location; struct buffer_head *bh; - struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_super_block *disk_super; struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = NULL; - struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_root *extent_root; + struct btrfs_root *csum_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; struct btrfs_root *log_tree_root; - int ret; int err = -EINVAL; - - struct btrfs_super_block *disk_super; - - if (!extent_root || !tree_root || !tree_root->fs_info || - !chunk_root || !dev_root || !csum_root) { + int num_backups_tried = 0; + int backup_index = 0; + + extent_root = fs_info->extent_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + csum_root = fs_info->csum_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + chunk_root = fs_info->chunk_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + dev_root = fs_info->dev_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + + if (!extent_root || !csum_root || !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } - fs_info = tree_root->fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -1598,7 +1935,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_bdi; } - fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS; + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -1614,15 +1951,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); + spin_lock_init(&fs_info->free_chunk_lock); mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); - fs_info->tree_root = tree_root; - fs_info->extent_root = extent_root; - fs_info->csum_root = csum_root; - fs_info->chunk_root = chunk_root; - fs_info->dev_root = dev_root; - fs_info->fs_devices = fs_devices; INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); @@ -1631,8 +1963,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->trans_block_rsv); btrfs_init_block_rsv(&fs_info->chunk_block_rsv); btrfs_init_block_rsv(&fs_info->empty_block_rsv); - INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); - mutex_init(&fs_info->durable_block_rsv_mutex); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); @@ -1643,6 +1974,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; + fs_info->free_chunk_space = 0; + + /* readahead state */ + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); + spin_lock_init(&fs_info->reada_lock); fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1671,7 +2007,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, sb->s_bdi = &fs_info->bdi; fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; - fs_info->btree_inode->i_nlink = 1; + set_nlink(fs_info->btree_inode, 1); /* * we set the i_size on the btree inode to the max possible int. * the real end of the address space is determined by all of @@ -1732,14 +2068,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_alloc; } - memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); - memcpy(&fs_info->super_for_commit, &fs_info->super_copy, - sizeof(fs_info->super_for_commit)); + memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); brelse(bh); - memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; @@ -1749,6 +2085,13 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); /* + * run through our array of backup supers and setup + * our ring pointer to the oldest one + */ + generation = btrfs_super_generation(disk_super); + find_oldest_super_backup(fs_info, generation); + + /* * In the long term, we'll store the compression type in the super * block, and it'll be used for per file compression control. */ @@ -1802,6 +2145,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->thread_pool_size), &fs_info->generic_worker); + btrfs_init_workers(&fs_info->caching_workers, "cache", + 2, &fs_info->generic_worker); + /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the * devices @@ -1833,6 +2179,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->readahead_workers, "readahead", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1843,6 +2192,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; + fs_info->readahead_workers.idle_thresh = 2; btrfs_start_workers(&fs_info->workers, 1); btrfs_start_workers(&fs_info->generic_worker, 1); @@ -1855,6 +2205,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_write_workers, 1); btrfs_start_workers(&fs_info->endio_freespace_worker, 1); btrfs_start_workers(&fs_info->delayed_workers, 1); + btrfs_start_workers(&fs_info->caching_workers, 1); + btrfs_start_workers(&fs_info->readahead_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -1901,7 +2253,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); chunk_root->commit_root = btrfs_root_node(chunk_root); @@ -1916,11 +2268,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } btrfs_close_extra_devices(fs_devices); +retry_root_backup: blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); generation = btrfs_super_generation(disk_super); @@ -1928,32 +2281,33 @@ struct btrfs_root *open_ctree(struct super_block *sb, tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), blocksize, generation); - if (!tree_root->node) - goto fail_chunk_root; - if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + if (!tree_root->node || + !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", sb->s_id); - goto fail_tree_root; + + goto recovery_tree_root; } + btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); if (ret) - goto fail_tree_root; + goto recovery_tree_root; extent_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); if (ret) - goto fail_extent_root; + goto recovery_tree_root; dev_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) - goto fail_dev_root; + goto recovery_tree_root; csum_root->track_dirty = 1; @@ -2086,22 +2440,13 @@ fail_cleaner: fail_block_groups: btrfs_free_block_groups(fs_info); - free_extent_buffer(csum_root->node); - free_extent_buffer(csum_root->commit_root); -fail_dev_root: - free_extent_buffer(dev_root->node); - free_extent_buffer(dev_root->commit_root); -fail_extent_root: - free_extent_buffer(extent_root->node); - free_extent_buffer(extent_root->commit_root); -fail_tree_root: - free_extent_buffer(tree_root->node); - free_extent_buffer(tree_root->commit_root); -fail_chunk_root: - free_extent_buffer(chunk_root->node); - free_extent_buffer(chunk_root->commit_root); + +fail_tree_roots: + free_root_pointers(fs_info, 1); + fail_sb_buffer: btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->readahead_workers); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -2112,26 +2457,39 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_freespace_worker); btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); + btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: - kfree(fs_info->delayed_root); fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); - - btrfs_close_devices(fs_info->fs_devices); - btrfs_mapping_tree_free(&fs_info->mapping_tree); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: - kfree(extent_root); - kfree(tree_root); - kfree(fs_info); - kfree(chunk_root); - kfree(dev_root); - kfree(csum_root); + btrfs_close_devices(fs_info->fs_devices); + free_fs_info(fs_info); return ERR_PTR(err); + +recovery_tree_root: + if (!btrfs_test_opt(tree_root, RECOVERY)) + goto fail_tree_roots; + + free_root_pointers(fs_info, 0); + + /* don't use the log in recovery mode, it won't be valid */ + btrfs_set_super_log_root(disk_super, 0); + + /* we can't trust the free space cache either */ + btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + + ret = next_root_backup(fs_info, fs_info->super_copy, + &num_backups_tried, &backup_index); + if (ret == -1) + goto fail_block_groups; + goto retry_root_backup; } static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) @@ -2215,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; - int last_barrier = 0; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; - /* make sure only the last submit_bh does a barrier */ - if (do_barriers) { - for (i = 0; i < max_mirrors; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - device->total_bytes) - break; - last_barrier = i; - } - } - for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) @@ -2276,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device, bh->b_end_io = btrfs_end_buffer_write_sync; } - if (i == last_barrier && do_barriers) - ret = submit_bh(WRITE_FLUSH_FUA, bh); - else - ret = submit_bh(WRITE_SYNC, bh); - + /* + * we fua the first super. The others we allow + * to go down lazy. + */ + ret = submit_bh(WRITE_FUA, bh); if (ret) errors++; } return errors < i ? 0 : -1; } +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/* + * trigger flushes for one the devices. If you pass wait == 0, the flushes are + * sent down. With wait == 1, it waits for the previous flush. + * + * any device where the flush fails with eopnotsupp are flagged as not-barrier + * capable + */ +static int write_dev_flush(struct btrfs_device *device, int wait) +{ + struct bio *bio; + int ret = 0; + + if (device->nobarriers) + return 0; + + if (wait) { + bio = device->flush_bio; + if (!bio) + return 0; + + wait_for_completion(&device->flush_wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + device->nobarriers = 1; + } + if (!bio_flagged(bio, BIO_UPTODATE)) { + ret = -EIO; + } + + /* drop the reference from the wait == 0 run */ + bio_put(bio); + device->flush_bio = NULL; + + return ret; + } + + /* + * one reference for us, and we leave it for the + * caller + */ + device->flush_bio = NULL;; + bio = bio_alloc(GFP_NOFS, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; + device->flush_bio = bio; + + bio_get(bio); + submit_bio(WRITE_FLUSH, bio); + + return 0; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ + struct list_head *head; + struct btrfs_device *dev; + int errors = 0; + int ret; + + /* send down all the barriers */ + head = &info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 0); + if (ret) + errors++; + } + + /* wait for all the barriers */ + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 1); + if (ret) + errors++; + } + if (errors) + return -EIO; + return 0; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2299,14 +2764,19 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, NOBARRIER); + backup_super_roots(root->fs_info); - sb = &root->fs_info->super_for_commit; + sb = root->fs_info->super_for_commit; dev_item = &sb->dev_item; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; + + if (do_barriers) + barrier_all_devices(root->fs_info); + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { total_errors++; @@ -2506,8 +2976,6 @@ int close_ctree(struct btrfs_root *root) /* clear out the rbtree of defraggable inodes */ btrfs_run_defrag_inodes(root->fs_info); - btrfs_put_block_group_cache(fs_info); - /* * Here come 2 situations when btrfs is broken to flip readonly: * @@ -2533,6 +3001,8 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } + btrfs_put_block_group_cache(fs_info); + kthread_stop(root->fs_info->transaction_kthread); kthread_stop(root->fs_info->cleaner_kthread); @@ -2564,7 +3034,6 @@ int close_ctree(struct btrfs_root *root) del_fs_roots(fs_info); iput(fs_info->btree_inode); - kfree(fs_info->delayed_root); btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); @@ -2577,6 +3046,8 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_freespace_worker); btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); + btrfs_stop_workers(&fs_info->caching_workers); + btrfs_stop_workers(&fs_info->readahead_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -2584,12 +3055,7 @@ int close_ctree(struct btrfs_root *root) bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - kfree(fs_info->csum_root); - kfree(fs_info); + free_fs_info(fs_info); return 0; } @@ -2695,7 +3161,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) return ret; } -int btree_lock_page_hook(struct page *page) +static int btree_lock_page_hook(struct page *page, void *data, + void (*flush_fn)(void *)) { struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2712,7 +3179,10 @@ int btree_lock_page_hook(struct page *page) if (!eb) goto out; - btrfs_tree_lock(eb); + if (!btrfs_try_tree_write_lock(eb)) { + flush_fn(data); + btrfs_tree_lock(eb); + } btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { @@ -2727,7 +3197,10 @@ int btree_lock_page_hook(struct page *page) btrfs_tree_unlock(eb); free_extent_buffer(eb); out: - lock_page(page); + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } return 0; } @@ -3083,6 +3556,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, + .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a0b610a67aae..c99d0a8f13fa 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, @@ -83,14 +85,16 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btree_lock_page_hook(struct page *page); - #ifdef CONFIG_DEBUG_LOCK_ALLOC -void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); +void btrfs_init_lockdep(void); +void btrfs_set_buffer_lockdep_class(u64 objectid, + struct extent_buffer *eb, int level); #else -static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, - int level) +static inline void btrfs_init_lockdep(void) +{ } +static inline void btrfs_set_buffer_lockdep_class(u64 objectid, + struct extent_buffer *eb, int level) { } #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71cd456fdb60..f0d5718d2587 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,6 +23,7 @@ #include <linux/rcupdate.h> #include <linux/kthread.h> #include <linux/slab.h> +#include <linux/ratelimit.h> #include "compat.h" #include "hash.h" #include "ctree.h" @@ -52,6 +53,21 @@ enum { CHUNK_ALLOC_LIMITED = 2, }; +/* + * Control how reservations are dealt with. + * + * RESERVE_FREE - freeing a reservation. + * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for + * ENOSPC accounting + * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update + * bytes_may_use as the ENOSPC accounting is done elsewhere + */ +enum { + RESERVE_FREE = 0, + RESERVE_ALLOC = 1, + RESERVE_ALLOC_NO_ACCOUNT = 2, +}; + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); @@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); - WARN_ON(cache->reserved_pinned > 0); kfree(cache->free_space_ctl); kfree(cache); } @@ -320,12 +337,12 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, return total_added; } -static int caching_kthread(void *data) +static noinline void caching_thread(struct btrfs_work *work) { - struct btrfs_block_group_cache *block_group = data; - struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_caching_control *caching_ctl; + struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; @@ -334,9 +351,14 @@ static int caching_kthread(void *data) u32 nritems; int ret = 0; + caching_ctl = container_of(work, struct btrfs_caching_control, work); + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + extent_root = fs_info->extent_root; + path = btrfs_alloc_path(); if (!path) - return -ENOMEM; + goto out; last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@ -433,13 +455,11 @@ err: free_excluded_extents(extent_root, block_group); mutex_unlock(&caching_ctl->mutex); +out: wake_up(&caching_ctl->wait); put_caching_control(caching_ctl); - atomic_dec(&block_group->space_info->caching_threads); btrfs_put_block_group(block_group); - - return 0; } static int cache_block_group(struct btrfs_block_group_cache *cache, @@ -447,14 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_root *root, int load_cache_only) { + DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; - struct task_struct *tsk; int ret = 0; - smp_mb(); - if (cache->cached != BTRFS_CACHE_NO) + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + atomic_set(&caching_ctl->count, 1); + caching_ctl->work.func = caching_thread; + + spin_lock(&cache->lock); + /* + * This should be a rare occasion, but this could happen I think in the + * case where one thread starts to load the space cache info, and then + * some other thread starts a transaction commit which tries to do an + * allocation while the other thread is still loading the space cache + * info. The previous loop should have kept us from choosing this block + * group, but if we've moved to the state where we will wait on caching + * block groups we need to first check if we're doing a fast load here, + * so we can wait for it to finish, otherwise we could end up allocating + * from a block group who's cache gets evicted for one reason or + * another. + */ + while (cache->cached == BTRFS_CACHE_FAST) { + struct btrfs_caching_control *ctl; + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&cache->lock); + + schedule(); + + finish_wait(&ctl->wait, &wait); + put_caching_control(ctl); + spin_lock(&cache->lock); + } + + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); return 0; + } + WARN_ON(cache->caching_ctl); + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_FAST; + spin_unlock(&cache->lock); /* * We can't do the read from on-disk cache during a commit since we need @@ -463,69 +528,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, * we likely hold important locks. */ if (trans && (!trans->transaction->in_commit) && - (root && root != root->fs_info->tree_root)) { - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - return 0; - } - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); - + (root && root != root->fs_info->tree_root) && + btrfs_test_opt(root, SPACE_CACHE)) { ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); if (ret == 1) { + cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; } else { - cache->cached = BTRFS_CACHE_NO; + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } } spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); if (ret == 1) { + put_caching_control(caching_ctl); free_excluded_extents(fs_info->extent_root, cache); return 0; } + } else { + /* + * We are not going to do the fast caching, set cached to the + * appropriate value and wakeup any waiters. + */ + spin_lock(&cache->lock); + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); } - if (load_cache_only) - return 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - BUG_ON(!caching_ctl); - - INIT_LIST_HEAD(&caching_ctl->list); - mutex_init(&caching_ctl->mutex); - init_waitqueue_head(&caching_ctl->wait); - caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; - /* one for caching kthread, one for caching block group list */ - atomic_set(&caching_ctl->count, 2); - - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - kfree(caching_ctl); + if (load_cache_only) { + put_caching_control(caching_ctl); return 0; } - cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); down_write(&fs_info->extent_commit_sem); + atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->extent_commit_sem); - atomic_inc(&cache->space_info->caching_threads); btrfs_get_block_group(cache); - tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", - cache->key.objectid); - if (IS_ERR(tsk)) { - ret = PTR_ERR(tsk); - printk(KERN_ERR "error running thread %d\n", ret); - BUG(); - } + btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); return ret; } @@ -667,7 +722,9 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) struct btrfs_path *path; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + key.objectid = start; key.offset = len; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); @@ -1772,18 +1829,21 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, { int ret; u64 discarded_bytes = 0; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, - bytenr, &num_bytes, &multi, 0); + bytenr, &num_bytes, &bbio, 0); if (!ret) { - struct btrfs_bio_stripe *stripe = multi->stripes; + struct btrfs_bio_stripe *stripe = bbio->stripes; int i; - for (i = 0; i < multi->num_stripes; i++, stripe++) { + for (i = 0; i < bbio->num_stripes; i++, stripe++) { + if (!stripe->dev->can_discard) + continue; + ret = btrfs_issue_discard(stripe->dev->bdev, stripe->physical, stripe->length); @@ -1791,11 +1851,16 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, discarded_bytes += stripe->length; else if (ret != -EOPNOTSUPP) break; + + /* + * Just in case we get back EOPNOTSUPP for some reason, + * just ignore the return value so we don't screw up + * people calling discard_extent. + */ + ret = 0; } - kfree(multi); + kfree(bbio); } - if (discarded_bytes && ret == -EOPNOTSUPP) - ret = 0; if (actual_bytes) *actual_bytes = discarded_bytes; @@ -2694,6 +2759,13 @@ again: goto again; } + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + /* * We want to set the generation to 0, that way if anything goes wrong * from here on out we know not to trust this cache when we load up next @@ -2743,12 +2815,15 @@ again: if (!ret) dcs = BTRFS_DC_SETUP; btrfs_free_reserved_data_space(inode, num_pages); + out_put: iput(inode); out_free: btrfs_release_path(path); out: spin_lock(&block_group->lock); + if (!ret) + block_group->cache_generation = trans->transid; block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); @@ -2932,9 +3007,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->full = 0; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; + found->flush = 0; + init_waitqueue_head(&found->wait); *space_info = found; list_add_rcu(&found->list, &info->space_info); - atomic_set(&found->caching_threads, 0); return 0; } @@ -3115,16 +3191,13 @@ commit_trans: return -ENOSPC; } data_sinfo->bytes_may_use += bytes; - BTRFS_I(inode)->reserved_bytes += bytes; spin_unlock(&data_sinfo->lock); return 0; } /* - * called when we are clearing an delalloc extent from the - * inode's io_tree or there was an error for whatever reason - * after calling btrfs_check_data_free_space + * Called if we need to clear a data reservation for this inode. */ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) { @@ -3137,7 +3210,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = BTRFS_I(inode)->space_info; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; spin_unlock(&data_sinfo->lock); } @@ -3158,6 +3230,7 @@ static int should_alloc_chunk(struct btrfs_root *root, struct btrfs_space_info *sinfo, u64 alloc_bytes, int force) { + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; u64 thresh; @@ -3166,11 +3239,18 @@ static int should_alloc_chunk(struct btrfs_root *root, return 1; /* + * We need to take into account the global rsv because for all intents + * and purposes it's used space. Don't worry about locking the + * global_rsv, it doesn't change except when the transaction commits. + */ + num_allocated += global_rsv->size; + + /* * in limited mode, we want to have some free space up to * about 1% of the FS size. */ if (force == CHUNK_ALLOC_LIMITED) { - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); thresh = max_t(u64, 64 * 1024 * 1024, div_factor_fine(thresh, 1)); @@ -3192,7 +3272,7 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) return 0; - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); /* 256MB or 5% of the FS */ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); @@ -3275,6 +3355,9 @@ again: } ret = btrfs_alloc_chunk(trans, extent_root, flags); + if (ret < 0 && ret != -ENOSPC) + goto out; + spin_lock(&space_info->lock); if (ret) space_info->full = 1; @@ -3284,6 +3367,7 @@ again: space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); +out: mutex_unlock(&extent_root->fs_info->chunk_mutex); return ret; } @@ -3291,42 +3375,54 @@ again: /* * shrink metadata reservation for delalloc */ -static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim, int sync) +static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, + bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; u64 reserved; u64 max_reclaim; u64 reclaimed = 0; long time_left; - int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; unsigned long progress; + trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; smp_mb(); - reserved = space_info->bytes_reserved; + reserved = space_info->bytes_may_use; progress = space_info->reservation_progress; if (reserved == 0) return 0; - max_reclaim = min(reserved, to_reclaim); + smp_mb(); + if (root->fs_info->delalloc_bytes == 0) { + if (trans) + return 0; + btrfs_wait_ordered_extents(root, 0, 0); + return 0; + } + max_reclaim = min(reserved, to_reclaim); + nr_pages = max_t(unsigned long, nr_pages, + max_reclaim >> PAGE_CACHE_SHIFT); while (loops < 1024) { /* have the flusher threads jump in and do some IO */ smp_mb(); nr_pages = min_t(unsigned long, nr_pages, root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); + writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, + WB_REASON_FS_FREE_SPACE); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_reserved) - reclaimed += reserved - space_info->bytes_reserved; - reserved = space_info->bytes_reserved; + if (reserved > space_info->bytes_may_use) + reclaimed += reserved - space_info->bytes_may_use; + reserved = space_info->bytes_may_use; spin_unlock(&space_info->lock); loops++; @@ -3337,11 +3433,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (trans && trans->transaction->blocked) return -EAGAIN; - time_left = schedule_timeout_interruptible(1); + if (wait_ordered && !trans) { + btrfs_wait_ordered_extents(root, 0, 0); + } else { + time_left = schedule_timeout_interruptible(1); - /* We were interrupted, exit */ - if (time_left) - break; + /* We were interrupted, exit */ + if (time_left) + break; + } /* we've kicked the IO a few times, if anything has been freed, * exit. There is no sense in looping here for a long time @@ -3356,42 +3456,121 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } + return reclaimed >= to_reclaim; } -/* - * Retries tells us how many times we've called reserve_metadata_bytes. The - * idea is if this is the first call (retries == 0) then we will add to our - * reserved count if we can't make the allocation in order to hold our place - * while we go and try and free up space. That way for retries > 1 we don't try - * and add space, we just check to see if the amount of unused space is >= the - * total space, meaning that our reservation is valid. +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit * - * However if we don't intend to retry this reservation, pass -1 as retries so - * that it short circuits this logic. + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does. Otherwise it + * will return -ENOSPC. */ -static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int may_commit_transaction(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 bytes, int force) +{ + struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; + struct btrfs_trans_handle *trans; + + trans = (struct btrfs_trans_handle *)current->journal_info; + if (trans) + return -EAGAIN; + + if (force) + goto commit; + + /* See if there is enough pinned space to make this reservation */ + spin_lock(&space_info->lock); + if (space_info->bytes_pinned >= bytes) { + spin_unlock(&space_info->lock); + goto commit; + } + spin_unlock(&space_info->lock); + + /* + * See if there is some space in the delayed insertion reservation for + * this reservation. + */ + if (space_info != delayed_rsv->space_info) + return -ENOSPC; + + spin_lock(&delayed_rsv->lock); + if (delayed_rsv->size < bytes) { + spin_unlock(&delayed_rsv->lock); + return -ENOSPC; + } + spin_unlock(&delayed_rsv->lock); + +commit: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return -ENOSPC; + + return btrfs_commit_transaction(trans, root); +} + +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - wether or not we can flush to make our reservation + * + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; - u64 unused; + u64 used; u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; - bool reserved = false; bool committed = false; + bool flushing = false; + bool wait_ordered = false; again: - ret = -ENOSPC; - if (reserved) - num_bytes = 0; - + ret = 0; spin_lock(&space_info->lock); - unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + /* + * We only want to wait if somebody other than us is flushing and we are + * actually alloed to flush. + */ + while (flush && !flushing && space_info->flush) { + spin_unlock(&space_info->lock); + /* + * If we have a trans handle we can't wait because the flusher + * may have to commit the transaction, which would mean we would + * deadlock since we are waiting for the flusher to finish, but + * hold the current transaction open. + */ + if (current->journal_info) + return -EAGAIN; + ret = wait_event_interruptible(space_info->wait, + !space_info->flush); + /* Must have been interrupted, return */ + if (ret) + return -EINTR; + + spin_lock(&space_info->lock); + } + + ret = -ENOSPC; + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; /* * The idea here is that we've not already over-reserved the block group @@ -3400,11 +3579,9 @@ again: * lets start flushing stuff first and then come back and try to make * our reservation. */ - if (unused <= space_info->total_bytes) { - unused = space_info->total_bytes - unused; - if (unused >= num_bytes) { - if (!reserved) - space_info->bytes_reserved += orig_bytes; + if (used <= space_info->total_bytes) { + if (used + orig_bytes <= space_info->total_bytes) { + space_info->bytes_may_use += orig_bytes; ret = 0; } else { /* @@ -3420,91 +3597,129 @@ again: * amount plus the amount of bytes that we need for this * reservation. */ - num_bytes = unused - space_info->total_bytes + + wait_ordered = true; + num_bytes = used - space_info->total_bytes + (orig_bytes * (retries + 1)); } + if (ret) { + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 avail; + + /* + * If we have a lot of space that's pinned, don't bother doing + * the overcommit dance yet and just commit the transaction. + */ + avail = (space_info->total_bytes - space_info->bytes_used) * 8; + do_div(avail, 10); + if (space_info->bytes_pinned >= avail && flush && !committed) { + space_info->flush = 1; + flushing = true; + spin_unlock(&space_info->lock); + ret = may_commit_transaction(root, space_info, + orig_bytes, 1); + if (ret) + goto out; + committed = true; + goto again; + } + + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we aren't flushing don't let us overcommit too much, say + * 1/8th of the space. If we can flush, let it overcommit up to + * 1/2 of the space. + */ + if (flush) + avail >>= 3; + else + avail >>= 1; + spin_unlock(&root->fs_info->free_chunk_lock); + + if (used + num_bytes < space_info->total_bytes + avail) { + space_info->bytes_may_use += orig_bytes; + ret = 0; + } else { + wait_ordered = true; + } + } + /* * Couldn't make our reservation, save our place so while we're trying * to reclaim space we can actually use it instead of somebody else * stealing it from us. */ - if (ret && !reserved) { - space_info->bytes_reserved += orig_bytes; - reserved = true; + if (ret && flush) { + flushing = true; + space_info->flush = 1; } spin_unlock(&space_info->lock); - if (!ret) - return 0; - - if (!flush) + if (!ret || !flush) goto out; /* * We do synchronous shrinking since we don't actually unreserve * metadata until after the IO is completed. */ - ret = shrink_delalloc(trans, root, num_bytes, 1); - if (ret > 0) - return 0; - else if (ret < 0) + ret = shrink_delalloc(root, num_bytes, wait_ordered); + if (ret < 0) goto out; + ret = 0; + /* * So if we were overcommitted it's possible that somebody else flushed * out enough space and we simply didn't have enough space to reclaim, * so go back around and try again. */ if (retries < 2) { + wait_ordered = true; retries++; goto again; } - spin_lock(&space_info->lock); - /* - * Not enough space to be reclaimed, don't bother committing the - * transaction. - */ - if (space_info->bytes_pinned < orig_bytes) - ret = -ENOSPC; - spin_unlock(&space_info->lock); - if (ret) - goto out; - - ret = -EAGAIN; - if (trans || committed) - goto out; - ret = -ENOSPC; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) + if (committed) goto out; - ret = btrfs_commit_transaction(trans, root); + + ret = may_commit_transaction(root, space_info, orig_bytes, 0); if (!ret) { - trans = NULL; committed = true; goto again; } out: - if (reserved) { + if (flushing) { spin_lock(&space_info->lock); - space_info->bytes_reserved -= orig_bytes; + space_info->flush = 0; + wake_up_all(&space_info->wait); spin_unlock(&space_info->lock); } - return ret; } static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_rsv *block_rsv; - if (root->ref_cows) + struct btrfs_block_rsv *block_rsv = NULL; + + if (root->ref_cows || root == root->fs_info->csum_root) block_rsv = trans->block_rsv; - else + + if (!block_rsv) block_rsv = root->block_rsv; if (!block_rsv) @@ -3575,7 +3790,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, } if (num_bytes) { spin_lock(&space_info->lock); - space_info->bytes_reserved -= num_bytes; + space_info->bytes_may_use -= num_bytes; space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -3599,9 +3814,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); - atomic_set(&rsv->usage, 1); - rsv->priority = 6; - INIT_LIST_HEAD(&rsv->list); } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) @@ -3622,38 +3834,20 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { - if (rsv && atomic_dec_and_test(&rsv->usage)) { - btrfs_block_rsv_release(root, rsv, (u64)-1); - if (!rsv->durable) - kfree(rsv); - } -} - -/* - * make the block_rsv struct be able to capture freed space. - * the captured space will re-add to the the block_rsv struct - * after transaction commit - */ -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv) -{ - block_rsv->durable = 1; - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); - mutex_unlock(&fs_info->durable_block_rsv_mutex); + btrfs_block_rsv_release(root, rsv, (u64)-1); + kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +static inline int __block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int flush) { int ret; if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3662,56 +3856,80 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, return ret; } -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor) +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + return __block_rsv_add(root, block_rsv, num_bytes, 1); +} + +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + return __block_rsv_add(root, block_rsv, num_bytes, 0); +} + +int btrfs_block_rsv_check(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int min_factor) { u64 num_bytes = 0; - int commit_trans = 0; int ret = -ENOSPC; if (!block_rsv) return 0; spin_lock(&block_rsv->lock); - if (min_factor > 0) - num_bytes = div_factor(block_rsv->size, min_factor); - if (min_reserved > num_bytes) - num_bytes = min_reserved; + num_bytes = div_factor(block_rsv->size, min_factor); + if (block_rsv->reserved >= num_bytes) + ret = 0; + spin_unlock(&block_rsv->lock); - if (block_rsv->reserved >= num_bytes) { + return ret; +} + +static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int flush) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = min_reserved; + if (block_rsv->reserved >= num_bytes) ret = 0; - } else { + else num_bytes -= block_rsv->reserved; - if (block_rsv->durable && - block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) - commit_trans = 1; - } spin_unlock(&block_rsv->lock); + if (!ret) return 0; - if (block_rsv->refill_used) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - num_bytes, 0); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); - return 0; - } + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + return 0; } - if (commit_trans) { - if (trans) - return -EAGAIN; + return ret; +} - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - ret = btrfs_commit_transaction(trans, root); - return 0; - } +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); +} - return -ENOSPC; +int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); } int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, @@ -3743,7 +3961,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) u64 num_bytes; u64 meta_used; u64 data_used; - int csum_size = btrfs_super_csum_size(&fs_info->super_copy); + int csum_size = btrfs_super_csum_size(fs_info->super_copy); sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); spin_lock(&sinfo->lock); @@ -3787,12 +4005,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) if (sinfo->total_bytes > num_bytes) { num_bytes = sinfo->total_bytes - num_bytes; block_rsv->reserved += num_bytes; - sinfo->bytes_reserved += num_bytes; + sinfo->bytes_may_use += num_bytes; } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - sinfo->bytes_reserved -= num_bytes; + sinfo->bytes_may_use -= num_bytes; sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -3808,16 +4026,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); fs_info->chunk_block_rsv.space_info = space_info; - fs_info->chunk_block_rsv.priority = 10; space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; - fs_info->global_block_rsv.priority = 10; - fs_info->global_block_rsv.refill_used = 1; fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; - fs_info->empty_block_rsv.priority = 10; + fs_info->delayed_block_rsv.space_info = space_info; fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; @@ -3825,10 +4040,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); - - btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); - update_global_block_rsv(fs_info); } @@ -3841,57 +4052,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); WARN_ON(fs_info->chunk_block_rsv.reserved > 0); -} - -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *rsv) -{ - struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv; - u64 num_bytes; - int ret; - - /* - * Truncate should be freeing data, but give us 2 items just in case it - * needs to use some space. We may want to be smarter about this in the - * future. - */ - num_bytes = btrfs_calc_trans_metadata_size(root, 2); - - /* We already have enough bytes, just return */ - if (rsv->reserved >= num_bytes) - return 0; - - num_bytes -= rsv->reserved; - - /* - * You should have reserved enough space before hand to do this, so this - * should not fail. - */ - ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes); - BUG_ON(ret); - - return 0; -} - -int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - int num_items) -{ - u64 num_bytes; - int ret; - - if (num_items == 0 || root->fs_info->chunk_root == root) - return 0; - - num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, - num_bytes); - if (!ret) { - trans->bytes_reserved += num_bytes; - trans->block_rsv = &root->fs_info->trans_block_rsv; - } - return ret; + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); } void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -3900,9 +4062,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, if (!trans->bytes_reserved) return; - BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); - btrfs_block_rsv_release(root, trans->block_rsv, - trans->bytes_reserved); + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } @@ -3944,90 +4104,212 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } -static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +/** + * drop_outstanding_extent - drop an outstanding extent + * @inode: the inode we're dropping the extent for + * + * This is called when we are freeing up an outstanding extent, either called + * after an error or after an extent is written. This will return the number of + * reserved extents that need to be freed. This must be called with + * BTRFS_I(inode)->lock held. + */ +static unsigned drop_outstanding_extent(struct inode *inode) { - return num_bytes >>= 3; + unsigned drop_inode_space = 0; + unsigned dropped_extents = 0; + + BUG_ON(!BTRFS_I(inode)->outstanding_extents); + BTRFS_I(inode)->outstanding_extents--; + + if (BTRFS_I(inode)->outstanding_extents == 0 && + BTRFS_I(inode)->delalloc_meta_reserved) { + drop_inode_space = 1; + BTRFS_I(inode)->delalloc_meta_reserved = 0; + } + + /* + * If we have more or the same amount of outsanding extents than we have + * reserved then we need to leave the reserved extents count alone. + */ + if (BTRFS_I(inode)->outstanding_extents >= + BTRFS_I(inode)->reserved_extents) + return drop_inode_space; + + dropped_extents = BTRFS_I(inode)->reserved_extents - + BTRFS_I(inode)->outstanding_extents; + BTRFS_I(inode)->reserved_extents -= dropped_extents; + return dropped_extents + drop_inode_space; +} + +/** + * calc_csum_metadata_size - return the amount of metada space that must be + * reserved/free'd for the given bytes. + * @inode: the inode we're manipulating + * @num_bytes: the number of bytes in question + * @reserve: 1 if we are reserving space, 0 if we are freeing space + * + * This adjusts the number of csum_bytes in the inode and then returns the + * correct amount of metadata that must either be reserved or freed. We + * calculate how many checksums we can fit into one leaf and then divide the + * number of bytes that will need to be checksumed by this value to figure out + * how many checksums will be required. If we are adding bytes then the number + * may go up and we will return the number of additional bytes that must be + * reserved. If it is going down we will return the number of bytes that must + * be freed. + * + * This must be called with BTRFS_I(inode)->lock held. + */ +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, + int reserve) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 csum_size; + int num_csums_per_leaf; + int num_csums; + int old_csums; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && + BTRFS_I(inode)->csum_bytes == 0) + return 0; + + old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + if (reserve) + BTRFS_I(inode)->csum_bytes += num_bytes; + else + BTRFS_I(inode)->csum_bytes -= num_bytes; + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = (int)div64_u64(csum_size, + sizeof(struct btrfs_csum_item) + + sizeof(struct btrfs_disk_key)); + num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + num_csums = num_csums + num_csums_per_leaf - 1; + num_csums = num_csums / num_csums_per_leaf; + + old_csums = old_csums + num_csums_per_leaf - 1; + old_csums = old_csums / num_csums_per_leaf; + + /* No change, no need to reserve more */ + if (old_csums == num_csums) + return 0; + + if (reserve) + return btrfs_calc_trans_metadata_size(root, + num_csums - old_csums); + + return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); } int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; - u64 to_reserve; - int nr_extents; - int reserved_extents; + u64 to_reserve = 0; + unsigned nr_extents = 0; + int flush = 1; int ret; - if (btrfs_transaction_in_commit(root->fs_info)) + if (btrfs_is_free_space_inode(root, inode)) + flush = 0; + + if (flush && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); num_bytes = ALIGN(num_bytes, root->sectorsize); - nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; - reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; - if (nr_extents > reserved_extents) { - nr_extents -= reserved_extents; - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); - } else { - nr_extents = 0; - to_reserve = 0; + if (BTRFS_I(inode)->outstanding_extents > + BTRFS_I(inode)->reserved_extents) { + nr_extents = BTRFS_I(inode)->outstanding_extents - + BTRFS_I(inode)->reserved_extents; + BTRFS_I(inode)->reserved_extents += nr_extents; } - to_reserve += calc_csum_metadata_size(inode, num_bytes); - ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); - if (ret) - return ret; + /* + * Add an item to reserve for updating the inode when we complete the + * delalloc io. + */ + if (!BTRFS_I(inode)->delalloc_meta_reserved) { + nr_extents++; + BTRFS_I(inode)->delalloc_meta_reserved = 1; + } - atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents); - atomic_inc(&BTRFS_I(inode)->outstanding_extents); + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); + spin_unlock(&BTRFS_I(inode)->lock); - block_rsv_add_bytes(block_rsv, to_reserve, 1); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + if (ret) { + u64 to_free = 0; + unsigned dropped; - if (block_rsv->size > 512 * 1024 * 1024) - shrink_delalloc(NULL, root, to_reserve, 0); + spin_lock(&BTRFS_I(inode)->lock); + dropped = drop_outstanding_extent(inode); + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + /* + * Somebody could have come in and twiddled with the + * reservation, so if we have to free more than we would have + * reserved from this reservation go ahead and release those + * bytes. + */ + to_free -= to_reserve; + if (to_free) + btrfs_block_rsv_release(root, block_rsv, to_free); + return ret; + } + + block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; } +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for + * @num_bytes: the number of bytes we're releasing + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations. + */ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 to_free; - int nr_extents; - int reserved_extents; + u64 to_free = 0; + unsigned dropped; num_bytes = ALIGN(num_bytes, root->sectorsize); - atomic_dec(&BTRFS_I(inode)->outstanding_extents); - WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0); + spin_lock(&BTRFS_I(inode)->lock); + dropped = drop_outstanding_extent(inode); - reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); - do { - int old, new; - - nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); - if (nr_extents >= reserved_extents) { - nr_extents = 0; - break; - } - old = reserved_extents; - nr_extents = reserved_extents - nr_extents; - new = reserved_extents - nr_extents; - old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents, - reserved_extents, new); - if (likely(old == reserved_extents)) - break; - reserved_extents = old; - } while (1); - - to_free = calc_csum_metadata_size(inode, num_bytes); - if (nr_extents > 0) - to_free += btrfs_calc_trans_metadata_size(root, nr_extents); + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); + if (dropped > 0) + to_free += btrfs_calc_trans_metadata_size(root, dropped); btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); } +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * @inode: inode we're writing to + * @num_bytes: the number of bytes we want to allocate + * + * This will do the following things + * + * o reserve space in the data space info for num_bytes + * o reserve space in the metadata space info based on number of outstanding + * extents and how much csums will be needed + * o add to the inodes ->delalloc_bytes + * o add it to the fs_info's delalloc inodes list. + * + * This will return 0 for success and -ENOSPC if there is no space left. + */ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) { int ret; @@ -4045,6 +4327,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) return 0; } +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @num_bytes: the number of bytes we want to free up + * + * This must be matched with a call to btrfs_delalloc_reserve_space. This is + * called in the case that we don't need the metadata AND data reservations + * anymore. So if there is an error or we insert an inline extent. + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + */ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) { btrfs_delalloc_release_metadata(inode, num_bytes); @@ -4064,12 +4359,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, /* block accounting for super block */ spin_lock(&info->delalloc_lock); - old_val = btrfs_super_bytes_used(&info->super_copy); + old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; - btrfs_set_super_bytes_used(&info->super_copy, old_val); + btrfs_set_super_bytes_used(info->super_copy, old_val); spin_unlock(&info->delalloc_lock); while (total) { @@ -4097,7 +4392,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_super_cache_generation(&info->super_copy) != 0 && + if (btrfs_test_opt(root, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -4109,7 +4404,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; cache->space_info->bytes_used += num_bytes; cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); @@ -4161,7 +4455,6 @@ static int pin_down_extent(struct btrfs_root *root, if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4189,45 +4482,82 @@ int btrfs_pin_extent(struct btrfs_root *root, } /* - * update size of reserved extents. this function may return -EAGAIN - * if 'reserve' is true or 'sinfo' is false. + * this function must be called within transaction + */ +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + + /* + * pull in the free space cache (if any) so that our pin + * removes the free space from the cache. We have load_only set + * to one because the slow code to read in the free extents does check + * the pinned extents. + */ + cache_block_group(cache, trans, root, 1); + + pin_down_extent(root, cache, bytenr, num_bytes, 0); + + /* remove us from the free space cache (if we're there at all) */ + btrfs_remove_free_space(cache, bytenr, num_bytes); + btrfs_put_block_group(cache); + return 0; +} + +/** + * btrfs_update_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @reserve: One of the reservation enums + * + * This is called by the allocator when it reserves space, or by somebody who is + * freeing space that was never actually used on disk. For example if you + * reserve some space for a new leaf in transaction A and before transaction A + * commits you free that leaf, you call this with reserve set to 0 in order to + * clear the reservation. + * + * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * ENOSPC accounting. For data we handle the reservation through clearing the + * delalloc bits in the io_tree. We have to do this since we could end up + * allocating less disk space for the amount of data we have reserved in the + * case of compression. + * + * If this is a reservation and the block group has become read only we cannot + * make the reservation and return -EAGAIN, otherwise this function always + * succeeds. */ -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo) +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve) { + struct btrfs_space_info *space_info = cache->space_info; int ret = 0; - if (sinfo) { - struct btrfs_space_info *space_info = cache->space_info; - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - } - } else { - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->reservation_progress++; - } - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - } else { - spin_lock(&cache->lock); + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve != RESERVE_FREE) { if (cache->ro) { ret = -EAGAIN; } else { - if (reserve) - cache->reserved += num_bytes; - else - cache->reserved -= num_bytes; + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + if (reserve == RESERVE_ALLOC) { + BUG_ON(space_info->bytes_may_use < num_bytes); + space_info->bytes_may_use -= num_bytes; + } } - spin_unlock(&cache->lock); + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); return ret; } @@ -4293,13 +4623,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; cache->space_info->bytes_pinned -= len; - if (cache->ro) { + if (cache->ro) cache->space_info->bytes_readonly += len; - } else if (cache->reserved_pinned > 0) { - len = min(len, cache->reserved_pinned); - cache->reserved_pinned -= len; - cache->space_info->bytes_reserved += len; - } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } @@ -4314,11 +4639,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *unpin; - struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *next_rsv; u64 start; u64 end; - int idx; int ret; if (fs_info->pinned_extents == &fs_info->freed_extents[0]) @@ -4341,30 +4663,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_for_each_entry_safe(block_rsv, next_rsv, - &fs_info->durable_block_rsv_list, list) { - - idx = trans->transid & 0x1; - if (block_rsv->freed[idx] > 0) { - block_rsv_add_bytes(block_rsv, - block_rsv->freed[idx], 0); - block_rsv->freed[idx] = 0; - } - if (atomic_read(&block_rsv->usage) == 0) { - btrfs_block_rsv_release(root, block_rsv, (u64)-1); - - if (block_rsv->freed[0] == 0 && - block_rsv->freed[1] == 0) { - list_del_init(&block_rsv->list); - kfree(block_rsv); - } - } else { - btrfs_block_rsv_release(root, block_rsv, 0); - } - } - mutex_unlock(&fs_info->durable_block_rsv_mutex); - return 0; } @@ -4444,7 +4742,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, printk(KERN_ERR "umm, got %d back from search" ", was looking for %llu\n", ret, (unsigned long long)bytenr); - btrfs_print_leaf(extent_root, path->nodes[0]); + if (ret > 0) + btrfs_print_leaf(extent_root, + path->nodes[0]); } BUG_ON(ret); extent_slot = path->slots[0]; @@ -4640,7 +4940,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_block_rsv *block_rsv; struct btrfs_block_group_cache *cache = NULL; int ret; @@ -4655,64 +4954,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (!last_ref) return; - block_rsv = get_block_rsv(trans, root); cache = btrfs_lookup_block_group(root->fs_info, buf->start); - if (block_rsv->space_info != cache->space_info) - goto out; if (btrfs_header_generation(buf) == trans->transid) { if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) - goto pin; + goto out; } if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(root, cache, buf->start, buf->len, 1); - goto pin; + goto out; } WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); - if (ret == -EAGAIN) { - /* block group became read-only */ - btrfs_update_reserved_bytes(cache, buf->len, 0, 1); - goto out; - } - - ret = 1; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) { - block_rsv->reserved += buf->len; - ret = 0; - } - spin_unlock(&block_rsv->lock); - - if (ret) { - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_reserved -= buf->len; - cache->space_info->reservation_progress++; - spin_unlock(&cache->space_info->lock); - } - goto out; - } -pin: - if (block_rsv->durable && !cache->ro) { - ret = 0; - spin_lock(&cache->lock); - if (!cache->ro) { - cache->reserved_pinned += buf->len; - ret = 1; - } - spin_unlock(&cache->lock); - - if (ret) { - spin_lock(&block_rsv->lock); - block_rsv->freed[trans->transid & 0x1] += buf->len; - spin_unlock(&block_rsv->lock); - } + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); } out: /* @@ -4855,10 +5114,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int last_ptr_loop = 0; int loop = 0; int index = 0; + int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? + RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; + bool have_caching_bg = false; u64 ideal_cache_percent = 0; u64 ideal_cache_offset = 0; @@ -4941,6 +5203,7 @@ ideal_cache: } } search: + have_caching_bg = false; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { @@ -4970,13 +5233,15 @@ search: } have_block_group: - if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) { u64 free_percent; + found_uncached_bg = true; ret = cache_block_group(block_group, trans, orig_root, 1); if (block_group->cached == BTRFS_CACHE_FINISHED) - goto have_block_group; + goto alloc; free_percent = btrfs_block_group_used(&block_group->item); free_percent *= 100; @@ -4990,19 +5255,14 @@ have_block_group: } /* - * We only want to start kthread caching if we are at - * the point where we will wait for caching to make - * progress, or if our ideal search is over and we've - * found somebody to start caching. + * The caching workers are limited to 2 threads, so we + * can queue as much work as we care to. */ - if (loop > LOOP_CACHING_NOWAIT || - (loop > LOOP_FIND_IDEAL && - atomic_read(&space_info->caching_threads) < 2)) { + if (loop > LOOP_FIND_IDEAL) { ret = cache_block_group(block_group, trans, orig_root, 0); BUG_ON(ret); } - found_uncached_bg = true; /* * If loop is set for cached only, try the next block @@ -5012,17 +5272,14 @@ have_block_group: goto loop; } - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) - found_uncached_bg = true; - +alloc: if (unlikely(block_group->ro)) goto loop; spin_lock(&block_group->free_space_ctl->tree_lock); if (cached && block_group->free_space_ctl->free_space < - num_bytes + empty_size) { + num_bytes + empty_cluster + empty_size) { spin_unlock(&block_group->free_space_ctl->tree_lock); goto loop; } @@ -5043,12 +5300,10 @@ have_block_group: * people trying to start a new cluster */ spin_lock(&last_ptr->refill_lock); - if (last_ptr->block_group && - (last_ptr->block_group->ro || - !block_group_bits(last_ptr->block_group, data))) { - offset = 0; + if (!last_ptr->block_group || + last_ptr->block_group->ro || + !block_group_bits(last_ptr->block_group, data)) goto refill_cluster; - } offset = btrfs_alloc_from_cluster(block_group, last_ptr, num_bytes, search_start); @@ -5065,7 +5320,9 @@ have_block_group: * group is does point to and try again */ if (!last_ptr_loop && last_ptr->block_group && - last_ptr->block_group != block_group) { + last_ptr->block_group != block_group && + index <= + get_block_group_index(last_ptr->block_group)) { btrfs_put_block_group(block_group); block_group = last_ptr->block_group; @@ -5097,7 +5354,7 @@ refill_cluster: /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, - offset, num_bytes, + search_start, num_bytes, empty_cluster + empty_size); if (ret == 0) { /* @@ -5151,6 +5408,8 @@ refill_cluster: failed_alloc = true; goto have_block_group; } else if (!offset) { + if (!cached) + have_caching_bg = true; goto loop; } checks: @@ -5176,8 +5435,8 @@ checks: search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, - (data & BTRFS_BLOCK_GROUP_DATA)); + ret = btrfs_update_reserved_bytes(block_group, num_bytes, + alloc_type); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); goto loop; @@ -5201,6 +5460,9 @@ loop: } up_read(&space_info->groups_sem); + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) + goto search; + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; @@ -5219,8 +5481,7 @@ loop: if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; loop++; - if (!ideal_cache_percent && - atomic_read(&space_info->caching_threads)) + if (!ideal_cache_percent) goto search; /* @@ -5300,7 +5561,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info has %llu free, is %sfull\n", + printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", + (unsigned long long)info->flags, (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - info->bytes_readonly), @@ -5386,7 +5648,8 @@ again: return ret; } -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +static int __btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len, int pin) { struct btrfs_block_group_cache *cache; int ret = 0; @@ -5401,8 +5664,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); - btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, 0, 1); + if (pin) + pin_down_extent(root, cache, start, len, 1); + else { + btrfs_add_free_space(cache, start, len); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + } btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -5410,6 +5677,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) return ret; } +int btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 0); +} + +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 1); +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -5494,7 +5773,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, @@ -5604,7 +5884,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, put_caching_control(caching_ctl); } - ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); + ret = btrfs_update_reserved_bytes(block_group, ins->offset, + RESERVE_ALLOC_NO_ACCOUNT); BUG_ON(ret); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, @@ -5623,7 +5904,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); - btrfs_set_buffer_lockdep_class(buf, level); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); @@ -5661,8 +5942,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. @@ -5682,13 +5962,15 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; if (ret) { - WARN_ON(1); - ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, - 0); + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL, + /*DEFAULT_RATELIMIT_BURST*/ 2); + if (__ratelimit(&_rs)) { + printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); + WARN_ON(1); + } + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { - spin_lock(&block_rsv->lock); - block_rsv->size += blocksize; - spin_unlock(&block_rsv->lock); return block_rsv; } else if (ret && block_rsv != global_rsv) { ret = block_rsv_use_bytes(global_rsv, blocksize); @@ -5910,7 +6192,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, return 1; if (path->locks[level] && !wc->keep_locks) { - btrfs_tree_unlock(eb); + btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; } return 0; @@ -5934,7 +6216,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, * keep the tree lock */ if (path->locks[level] && level > 0) { - btrfs_tree_unlock(eb); + btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; } return 0; @@ -6047,7 +6329,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, BUG_ON(level != btrfs_header_level(next)); path->nodes[level] = next; path->slots[level] = 0; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; wc->level = level; if (wc->level == 1) wc->reada_slot = 0; @@ -6118,7 +6400,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, BUG_ON(level == 0); btrfs_tree_lock(eb); btrfs_set_lock_blocking(eb); - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, root, eb->start, eb->len, @@ -6127,8 +6409,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { - btrfs_tree_unlock(eb); - path->locks[level] = 0; + btrfs_tree_unlock_rw(eb, path->locks[level]); return 1; } } @@ -6150,7 +6431,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_generation(eb) == trans->transid) { btrfs_tree_lock(eb); btrfs_set_lock_blocking(eb); - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } clean_tree_block(trans, root, eb); } @@ -6229,7 +6510,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, return 0; if (path->locks[level]) { - btrfs_tree_unlock(path->nodes[level]); + btrfs_tree_unlock_rw(path->nodes[level], + path->locks[level]); path->locks[level] = 0; } free_extent_buffer(path->nodes[level]); @@ -6251,8 +6533,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. */ -int btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref) +void btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -6265,10 +6547,17 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int level; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + err = -ENOMEM; + goto out; + } wc = kzalloc(sizeof(*wc), GFP_NOFS); - BUG_ON(!wc); + if (!wc) { + btrfs_free_path(path); + err = -ENOMEM; + goto out; + } trans = btrfs_start_transaction(tree_root, 0); BUG_ON(IS_ERR(trans)); @@ -6281,7 +6570,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, path->nodes[level] = btrfs_lock_root_node(root); btrfs_set_lock_blocking(path->nodes[level]); path->slots[level] = 0; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; memset(&wc->update_progress, 0, sizeof(wc->update_progress)); } else { @@ -6296,7 +6585,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, path->lowest_level = 0; if (ret < 0) { err = ret; - goto out; + goto out_free; } WARN_ON(ret > 0); @@ -6403,11 +6692,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, free_extent_buffer(root->commit_root); kfree(root); } -out: +out_free: btrfs_end_transaction_throttle(trans, tree_root); kfree(wc); btrfs_free_path(path); - return err; +out: + if (err) + btrfs_std_error(root->fs_info, err); + return; } /* @@ -6449,7 +6741,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, level = btrfs_header_level(node); path->nodes[level] = node; path->slots[level] = 0; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; wc->refs[parent_level] = 1; wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -6524,30 +6816,45 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) return flags; } -static int set_block_group_ro(struct btrfs_block_group_cache *cache) +static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; + u64 min_allocable_bytes; int ret = -ENOSPC; - if (cache->ro) - return 0; + + /* + * We need some metadata space and system metadata space for + * allocating chunks in some corner cases until we force to set + * it to be readonly. + */ + if ((sinfo->flags & + (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && + !force) + min_allocable_bytes = 1 * 1024 * 1024; + else + min_allocable_bytes = 0; spin_lock(&sinfo->lock); spin_lock(&cache->lock); + + if (cache->ro) { + ret = 0; + goto out; + } + num_bytes = cache->key.offset - cache->reserved - cache->pinned - cache->bytes_super - btrfs_block_group_used(&cache->item); if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + - sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { + sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - sinfo->bytes_reserved += cache->reserved_pinned; - cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } - +out: spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); return ret; @@ -6571,7 +6878,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, CHUNK_ALLOC_FORCE); - ret = set_block_group_ro(cache); + ret = set_block_group_ro(cache, 0); if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); @@ -6579,7 +6886,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; - ret = set_block_group_ro(cache); + ret = set_block_group_ro(cache, 0); out: btrfs_end_transaction(trans, root); return ret; @@ -6680,6 +6987,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) struct btrfs_space_info *space_info; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_device *device; + u64 min_free; + u64 dev_min = 1; + u64 dev_nr = 0; + int index; int full = 0; int ret = 0; @@ -6689,8 +7000,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) if (!block_group) return -1; + min_free = btrfs_block_group_used(&block_group->item); + /* no bytes used, we're good */ - if (!btrfs_block_group_used(&block_group->item)) + if (!min_free) goto out; space_info = block_group->space_info; @@ -6706,10 +7019,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * all of the extents from this block group. If we can, we're good */ if ((space_info->total_bytes != block_group->key.offset) && - (space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - btrfs_block_group_used(&block_group->item) < - space_info->total_bytes)) { + (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + min_free < space_info->total_bytes)) { spin_unlock(&space_info->lock); goto out; } @@ -6726,9 +7038,31 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) if (full) goto out; + /* + * index: + * 0: raid10 + * 1: raid1 + * 2: dup + * 3: raid0 + * 4: single + */ + index = get_block_group_index(block_group); + if (index == 0) { + dev_min = 4; + /* Divide by 2 */ + min_free >>= 1; + } else if (index == 1) { + dev_min = 2; + } else if (index == 2) { + /* Multiply by 2 */ + min_free <<= 1; + } else if (index == 3) { + dev_min = fs_devices->rw_devices; + do_div(min_free, dev_min); + } + mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - u64 min_free = btrfs_block_group_used(&block_group->item); u64 dev_offset; /* @@ -6739,7 +7073,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) ret = find_free_dev_extent(NULL, device, min_free, &dev_offset, NULL); if (!ret) + dev_nr++; + + if (dev_nr >= dev_min) break; + ret = -1; } } @@ -6879,7 +7217,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_space_info, list); if (space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0) { + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0) { WARN_ON(1); dump_space_info(space_info, 0, 0); } @@ -6921,14 +7260,12 @@ int btrfs_read_block_groups(struct btrfs_root *root) return -ENOMEM; path->reada = 1; - cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); - if (cache_gen != 0 && - btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); + if (btrfs_test_opt(root, SPACE_CACHE) && + btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; if (btrfs_test_opt(root, CLEAR_CACHE)) need_clear = 1; - if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) - printk(KERN_INFO "btrfs: disk space caching is enabled\n"); while (1) { ret = find_first_block_group(root, path, &key); @@ -7016,7 +7353,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) - set_block_group_ro(cache); + set_block_group_ro(cache, 1); } list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { @@ -7030,9 +7367,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) * mirrored block groups. */ list_for_each_entry(cache, &space_info->block_groups[3], list) - set_block_group_ro(cache); + set_block_group_ro(cache, 1); list_for_each_entry(cache, &space_info->block_groups[4], list) - set_block_group_ro(cache); + set_block_group_ro(cache, 1); } init_global_block_rsv(info); @@ -7162,11 +7499,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cluster->refill_lock); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + ret = -ENOMEM; + goto out; + } - inode = lookup_free_space_inode(root, block_group, path); + inode = lookup_free_space_inode(tree_root, block_group, path); if (!IS_ERR(inode)) { - btrfs_orphan_add(trans, inode); + ret = btrfs_orphan_add(trans, inode); + BUG_ON(ret); clear_nlink(inode); /* One for the block groups ref */ spin_lock(&block_group->lock); @@ -7179,7 +7520,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); } /* One for our lookup ref */ - iput(inode); + btrfs_add_delayed_iput(inode); } key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -7250,7 +7591,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) int mixed = 0; int ret; - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) return 1; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7055d11c1efd..be1bf627a14b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,6 +17,7 @@ #include "compat.h" #include "ctree.h" #include "btrfs_inode.h" +#include "volumes.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -254,14 +255,14 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, * * This should be called with the tree lock held. */ -static int merge_state(struct extent_io_tree *tree, - struct extent_state *state) +static void merge_state(struct extent_io_tree *tree, + struct extent_state *state) { struct extent_state *other; struct rb_node *other_node; if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) - return 0; + return; other_node = rb_prev(&state->rb_node); if (other_node) { @@ -281,26 +282,19 @@ static int merge_state(struct extent_io_tree *tree, if (other->start == state->end + 1 && other->state == state->state) { merge_cb(tree, state, other); - other->start = state->start; - state->tree = NULL; - rb_erase(&state->rb_node, &tree->state); - free_extent_state(state); - state = NULL; + state->end = other->end; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); } } - - return 0; } -static int set_state_cb(struct extent_io_tree *tree, +static void set_state_cb(struct extent_io_tree *tree, struct extent_state *state, int *bits) { - if (tree->ops && tree->ops->set_bit_hook) { - return tree->ops->set_bit_hook(tree->mapping->host, - state, bits); - } - - return 0; + if (tree->ops && tree->ops->set_bit_hook) + tree->ops->set_bit_hook(tree->mapping->host, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, @@ -310,6 +304,9 @@ static void clear_state_cb(struct extent_io_tree *tree, tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, int *bits); + /* * insert an extent_state struct into the tree. 'bits' are set on the * struct before it is inserted. @@ -325,8 +322,6 @@ static int insert_state(struct extent_io_tree *tree, int *bits) { struct rb_node *node; - int bits_to_set = *bits & ~EXTENT_CTLBITS; - int ret; if (end < start) { printk(KERN_ERR "btrfs end < start %llu %llu\n", @@ -336,13 +331,9 @@ static int insert_state(struct extent_io_tree *tree, } state->start = start; state->end = end; - ret = set_state_cb(tree, state, bits); - if (ret) - return ret; - if (bits_to_set & EXTENT_DIRTY) - tree->dirty_bytes += end - start + 1; - state->state |= bits_to_set; + set_state_bits(tree, state, bits); + node = tree_insert(&tree->state, end, &state->rb_node); if (node) { struct extent_state *found; @@ -351,7 +342,6 @@ static int insert_state(struct extent_io_tree *tree, "%llu %llu\n", (unsigned long long)found->start, (unsigned long long)found->end, (unsigned long long)start, (unsigned long long)end); - free_extent_state(state); return -EEXIST; } state->tree = tree; @@ -359,13 +349,11 @@ static int insert_state(struct extent_io_tree *tree, return 0; } -static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, +static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, u64 split) { if (tree->ops && tree->ops->split_extent_hook) - return tree->ops->split_extent_hook(tree->mapping->host, - orig, split); - return 0; + tree->ops->split_extent_hook(tree->mapping->host, orig, split); } /* @@ -500,7 +488,8 @@ again: cached_state = NULL; } - if (cached && cached->tree && cached->start == start) { + if (cached && cached->tree && cached->start <= start && + cached->end > start) { if (clear) atomic_dec(&cached->refs); state = cached; @@ -660,34 +649,25 @@ again: if (start > end) break; - if (need_resched()) { - spin_unlock(&tree->lock); - cond_resched(); - spin_lock(&tree->lock); - } + cond_resched_lock(&tree->lock); } out: spin_unlock(&tree->lock); return 0; } -static int set_state_bits(struct extent_io_tree *tree, +static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, int *bits) { - int ret; int bits_to_set = *bits & ~EXTENT_CTLBITS; - ret = set_state_cb(tree, state, bits); - if (ret) - return ret; + set_state_cb(tree, state, bits); if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } state->state |= bits_to_set; - - return 0; } static void cache_state(struct extent_state *state, @@ -742,7 +722,8 @@ again: spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; - if (state->start == start && state->tree) { + if (state->start <= start && state->end > start && + state->tree) { node = &state->rb_node; goto hit_next; } @@ -779,17 +760,15 @@ hit_next: goto out; } - err = set_state_bits(tree, state, &bits); - if (err) - goto out; + set_state_bits(tree, state, &bits); - next_node = rb_next(node); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; + next_node = rb_next(&state->rb_node); if (next_node && start < end && prealloc && !need_resched()) { state = rb_entry(next_node, struct extent_state, rb_node); @@ -830,9 +809,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - err = set_state_bits(tree, state, &bits); - if (err) - goto out; + set_state_bits(tree, state, &bits); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -862,7 +839,6 @@ hit_next: * Avoid to free 'prealloc' if it can be merged with * the later extent. */ - atomic_inc(&prealloc->refs); err = insert_state(tree, prealloc, start, this_end, &bits); BUG_ON(err == -EEXIST); @@ -872,7 +848,6 @@ hit_next: goto out; } cache_state(prealloc, cached_state); - free_extent_state(prealloc); prealloc = NULL; start = this_end + 1; goto search_again; @@ -895,12 +870,196 @@ hit_next: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - err = set_state_bits(tree, prealloc, &bits); + set_state_bits(tree, prealloc, &bits); + cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +/** + * convert_extent - convert all bits in a given range from one bit to another + * @tree: the io tree to search + * @start: the start offset in bytes + * @end: the end offset in bytes (inclusive) + * @bits: the bits to set in this range + * @clear_bits: the bits to clear in this range + * @mask: the allocation mask + * + * This will go through and set bits for the given range. If any states exist + * already in this range they are set with the given bit and cleared of the + * clear_bits. This is only meant to be used by things that are mergeable, ie + * converting from say DELALLOC to DIRTY. This is not meant to be used with + * boundary bits like LOCK. + */ +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int clear_bits, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + u64 last_start; + u64 last_end; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + return -ENOMEM; + err = insert_state(tree, prealloc, start, end, &bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + struct rb_node *next_node; + + set_state_bits(tree, state, &bits); + clear_state_bit(tree, state, &clear_bits, 0); + + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + + start = last_end + 1; + next_node = rb_next(&state->rb_node); + if (next_node && start < end && prealloc && !need_resched()) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + return -ENOMEM; + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, &bits); + clear_state_bit(tree, state, &clear_bits, 0); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + return -ENOMEM; + + /* + * Avoid to free 'prealloc' if it can be merged with + * the later extent. + */ + err = insert_state(tree, prealloc, start, this_end, + &bits); + BUG_ON(err == -EEXIST); if (err) { + free_extent_state(prealloc); prealloc = NULL; goto out; } - cache_state(prealloc, cached_state); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + return -ENOMEM; + + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + set_state_bits(tree, prealloc, &bits); + clear_state_bit(tree, prealloc, &clear_bits, 0); + merge_state(tree, prealloc); prealloc = NULL; goto out; @@ -949,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, NULL, cached_state, mask); } @@ -1061,46 +1220,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) return 0; } -/* - * find the first offset in the io tree with 'bits' set. zero is - * returned if we find something, and *start_ret and *end_ret are - * set to reflect the state struct that was found. - * - * If nothing was found, 1 is returned, < 0 on error - */ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits) -{ - struct rb_node *node; - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) - goto out; - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) { - *start_ret = state->start; - *end_ret = state->end; - ret = 0; - break; - } - node = rb_next(node); - if (!node) - break; - } -out: - spin_unlock(&tree->lock); - return ret; -} - /* find the first state struct with 'bits' set after 'start', and * return it. tree->lock must be held. NULL will returned if * nothing was found after 'start' @@ -1133,6 +1252,30 @@ out: } /* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned, < 0 on error + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits) +{ + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, bits); + if (state) { + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + } + spin_unlock(&tree->lock); + return ret; +} + +/* * find a contiguous range of bytes in the file marked as delalloc, not * more than 'max_bytes'. start and end are used to return the range, * @@ -1564,7 +1707,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int bitset = 0; spin_lock(&tree->lock); - if (cached && cached->tree && cached->start == start) + if (cached && cached->tree && cached->start <= start && + cached->end > start) node = &cached->rb_node; else node = tree_search(tree, start); @@ -1644,6 +1788,368 @@ static int check_page_writeback(struct extent_io_tree *tree, return 0; } +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int this_mirror; + int failed_mirror; + int in_validation; +}; + +static int free_io_failure(struct inode *inode, struct io_failure_record *rec, + int did_repair) +{ + int ret; + int err = 0; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + + set_state_private(failure_tree, rec->start, 0); + ret = clear_extent_bits(failure_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret) + err = ret; + + if (did_repair) { + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_DAMAGED, GFP_NOFS); + if (ret && !err) + err = ret; + } + + kfree(rec); + return err; +} + +static void repair_io_failure_callback(struct bio *bio, int err) +{ + complete(bio->bi_private); +} + +/* + * this bypasses the standard btrfs submit functions deliberately, as + * the standard behavior is to write all copies in a raid setup. here we only + * want to write the one bad copy. so we do the mapping for ourselves and issue + * submit_bio directly. + * to avoid any synchonization issues, wait for the data after writing, which + * actually prevents the read that triggered the error from finishing. + * currently, there can be no more than two copies of every data bit. thus, + * exactly one rewrite is required. + */ +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, + u64 length, u64 logical, struct page *page, + int mirror_num) +{ + struct bio *bio; + struct btrfs_device *dev; + DECLARE_COMPLETION_ONSTACK(compl); + u64 map_length = 0; + u64 sector; + struct btrfs_bio *bbio = NULL; + int ret; + + BUG_ON(!mirror_num); + + bio = bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; + bio->bi_private = &compl; + bio->bi_end_io = repair_io_failure_callback; + bio->bi_size = 0; + map_length = length; + + ret = btrfs_map_block(map_tree, WRITE, logical, + &map_length, &bbio, mirror_num); + if (ret) { + bio_put(bio); + return -EIO; + } + BUG_ON(mirror_num != bbio->mirror_num); + sector = bbio->stripes[mirror_num-1].physical >> 9; + bio->bi_sector = sector; + dev = bbio->stripes[mirror_num-1].dev; + kfree(bbio); + if (!dev || !dev->bdev || !dev->writeable) { + bio_put(bio); + return -EIO; + } + bio->bi_bdev = dev->bdev; + bio_add_page(bio, page, length, start-page_offset(page)); + submit_bio(WRITE_SYNC, bio); + wait_for_completion(&compl); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + /* try to remap that extent elsewhere? */ + bio_put(bio); + return -EIO; + } + + printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " + "sector %llu)\n", page->mapping->host->i_ino, start, + dev->name, sector); + + bio_put(bio); + return 0; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int clean_io_failure(u64 start, struct page *page) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failrec; + struct btrfs_mapping_tree *map_tree; + struct extent_state *state; + int num_copies; + int did_repair = 0; + int ret; + struct inode *inode = page->mapping->host; + + private = 0; + ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY, 0); + if (!ret) + return 0; + + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, + &private_failure); + if (ret) + return 0; + + failrec = (struct io_failure_record *)(unsigned long) private_failure; + BUG_ON(!failrec->this_mirror); + + if (failrec->in_validation) { + /* there was no real error, just free the record */ + pr_debug("clean_io_failure: freeing dummy error at %llu\n", + failrec->start); + did_repair = 1; + goto out; + } + + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + + if (state && state->start == failrec->start) { + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; + num_copies = btrfs_num_copies(map_tree, failrec->logical, + failrec->len); + if (num_copies > 1) { + ret = repair_io_failure(map_tree, start, failrec->len, + failrec->logical, page, + failrec->failed_mirror); + did_repair = !ret; + } + } + +out: + if (!ret) + ret = free_io_failure(inode, failrec, did_repair); + + return ret; +} + +/* + * this is a generic handler for readpage errors (default + * readpage_io_failed_hook). if other copies exist, read those and write back + * good data to the failed position. does not investigate in remapping the + * failed extent elsewhere, hoping the device will be smart enough to do this as + * needed + */ + +static int bio_readpage_error(struct bio *failed_bio, struct page *page, + u64 start, u64 end, int failed_mirror, + struct extent_state *state) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + int num_copies; + int ret; + int read_mode; + u64 logical; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kzalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->this_mirror = 0; + failrec->bio_flags = 0; + failrec->in_validation = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (!em) { + read_unlock(&em_tree->lock); + kfree(failrec); + return -EIO; + } + + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); + } + pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " + "len=%llu\n", logical, start, failrec->len); + failrec->logical = logical; + free_extent_map(em); + + /* set the bits in the private failure tree */ + ret = set_extent_bits(failure_tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret >= 0) + ret = set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + /* set the bits in the inode's tree */ + if (ret >= 0) + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, + GFP_NOFS); + if (ret < 0) { + kfree(failrec); + return ret; + } + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + pr_debug("bio_readpage_error: (found) logical=%llu, " + "start=%llu, len=%llu, validation=%d\n", + failrec->logical, failrec->start, failrec->len, + failrec->in_validation); + /* + * when data can be on disk more than twice, add to failrec here + * (e.g. with a list for failed_mirror) to make + * clean_io_failure() clean all those errors at once. + */ + } + num_copies = btrfs_num_copies( + &BTRFS_I(inode)->root->fs_info->mapping_tree, + failrec->logical, failrec->len); + if (num_copies == 1) { + /* + * we only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. no + * matter what the error is, it is very likely to persist. + */ + pr_debug("bio_readpage_error: cannot repair, num_copies == 1. " + "state=%p, num_copies=%d, next_mirror %d, " + "failed_mirror %d\n", state, num_copies, + failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + if (!state) { + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, failrec->start, + EXTENT_LOCKED); + if (state && state->start != failrec->start) + state = NULL; + spin_unlock(&tree->lock); + } + + /* + * there are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + */ + if (failed_bio->bi_vcnt > 1) { + /* + * to fulfill b), we need to know the exact failing sectors, as + * we don't want to rewrite any more than the failed ones. thus, + * we need separate read requests for the failed bio + * + * if the following BUG_ON triggers, our validation request got + * merged. we need separate requests for our algorithm to work. + */ + BUG_ON(failrec->in_validation); + failrec->in_validation = 1; + failrec->this_mirror = failed_mirror; + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + } else { + /* + * we're ready to fulfill a) and b) alongside. get a good copy + * of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. + */ + if (failrec->in_validation) { + BUG_ON(failrec->this_mirror != failed_mirror); + failrec->in_validation = 0; + failrec->this_mirror = 0; + } + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) + failrec->this_mirror++; + read_mode = READ_SYNC; + } + + if (!state || failrec->this_mirror > num_copies) { + pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " + "next_mirror %d, failed_mirror %d\n", state, + num_copies, failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_private = state; + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_sector = failrec->logical >> 9; + bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + bio->bi_size = 0; + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + + pr_debug("bio_readpage_error: submitting new read[%#x] to " + "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, + failrec->this_mirror, num_copies, failrec->in_validation); + + tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, + failrec->bio_flags, 0); + return 0; +} + /* lots and lots of room for performance fixes in the end_bio funcs */ /* @@ -1742,6 +2248,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct extent_state *cached = NULL; struct extent_state *state; + pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " + "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, + (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; start = ((u64)page->index << PAGE_CACHE_SHIFT) + @@ -1772,12 +2281,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err) state); if (ret) uptodate = 0; + else + clean_io_failure(start, page); } - if (!uptodate && tree->ops && - tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook(bio, page, - start, end, NULL); + if (!uptodate) { + int failed_mirror; + failed_mirror = (int)(unsigned long)bio->bi_bdev; + /* + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, not + * in the !uptodate case). In that case it returns 0 and + * we just go on with the next page in our bio. If it + * can't handle the error it will return -EIO and we + * remain responsible for that page. + */ + ret = bio_readpage_error(bio, page, start, end, + failed_mirror, NULL); if (ret == 0) { +error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) @@ -1785,6 +2308,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) uncache_state(&cached); continue; } + if (tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook( + bio, page, start, end, + failed_mirror, state); + if (ret == 0) + goto error_handled; + } } if (uptodate) { @@ -1856,6 +2386,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, mirror_num, bio_flags, start); else submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; bio_put(bio); @@ -2121,16 +2652,16 @@ out: } int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent) + get_extent_t *get_extent, int mirror_num) { struct bio *bio = NULL; unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags); if (bio) - ret = submit_one_bio(READ, bio, 0, bio_flags); + ret = submit_one_bio(READ, bio, mirror_num, bio_flags); return ret; } @@ -2181,6 +2712,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, int compressed; int write_flags; unsigned long nr_written = 0; + bool fill_delalloc = true; if (wbc->sync_mode == WB_SYNC_ALL) write_flags = WRITE_SYNC; @@ -2190,6 +2722,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, trace___extent_writepage(page, inode, wbc); WARN_ON(!PageLocked(page)); + + ClearPageError(page); + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || (page->index == end_index && !pg_offset)) { @@ -2211,10 +2746,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_page_extent_mapped(page); + if (!tree->ops || !tree->ops->fill_delalloc) + fill_delalloc = false; + delalloc_start = start; delalloc_end = 0; page_started = 0; - if (!epd->extent_locked) { + if (!epd->extent_locked && fill_delalloc) { u64 delalloc_to_write = 0; /* * make sure the wbc mapping index is at least updated @@ -2432,6 +2970,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; + int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { @@ -2442,11 +2981,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, end = wbc->range_end >> PAGE_CACHE_SHIFT; scanned = 1; } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, min(end - index, - (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { unsigned i; scanned = 1; @@ -2460,10 +3004,16 @@ retry: * swizzled back from swapper_space to tmpfs file * mapping */ - if (tree->ops && tree->ops->write_cache_pages_lock_hook) - tree->ops->write_cache_pages_lock_hook(page); - else - lock_page(page); + if (tree->ops && + tree->ops->write_cache_pages_lock_hook) { + tree->ops->write_cache_pages_lock_hook(page, + data, flush_fn); + } else { + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } + } if (unlikely(page->mapping != mapping)) { unlock_page(page); @@ -2541,7 +3091,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, struct writeback_control *wbc) { int ret; - struct address_space *mapping = page->mapping; struct extent_page_data epd = { .bio = NULL, .tree = tree, @@ -2549,18 +3098,9 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; - struct writeback_control wbc_writepages = { - .sync_mode = wbc->sync_mode, - .older_than_this = NULL, - .nr_to_write = 64, - .range_start = page_offset(page) + PAGE_CACHE_SIZE, - .range_end = (loff_t)-1, - }; ret = __extent_writepage(page, wbc, &epd); - extent_write_cache_pages(tree, mapping, &wbc_writepages, - __extent_writepage, &epd, flush_write_bio); flush_epd_write_bio(&epd); return ret; } @@ -2584,7 +3124,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, }; struct writeback_control wbc_writepages = { .sync_mode = mode, - .older_than_this = NULL, .nr_to_write = nr_pages * 2, .range_start = start, .range_end = end + 1, @@ -2840,6 +3379,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); + len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); + /* * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size @@ -2887,7 +3429,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent_skip_holes(inode, off, last_for_get_extent, + em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); if (!em) goto out; @@ -2976,7 +3518,7 @@ out: return ret; } -static inline struct page *extent_buffer_page(struct extent_buffer *eb, +inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { struct page *p; @@ -3001,7 +3543,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb, return p; } -static inline unsigned long num_extent_pages(u64 start, u64 len) +inline unsigned long num_extent_pages(u64 start, u64 len) { return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT); @@ -3022,8 +3564,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; - spin_lock_init(&eb->lock); - init_waitqueue_head(&eb->lock_wq); + rwlock_init(&eb->lock); + atomic_set(&eb->write_locks, 0); + atomic_set(&eb->read_locks, 0); + atomic_set(&eb->blocking_readers, 0); + atomic_set(&eb->blocking_writers, 0); + atomic_set(&eb->spinning_readers, 0); + atomic_set(&eb->spinning_writers, 0); + init_waitqueue_head(&eb->write_lock_wq); + init_waitqueue_head(&eb->read_lock_wq); #if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); @@ -3119,7 +3668,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, i = 0; } for (; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); + p = find_or_create_page(mapping, index, GFP_NOFS); if (!p) { WARN_ON(1); goto free_eb; @@ -3247,6 +3796,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, PAGECACHE_TAG_DIRTY); } spin_unlock_irq(&page->mapping->tree_lock); + ClearPageError(page); unlock_page(page); } return 0; @@ -3266,6 +3816,22 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, return was_dirty; } +static int __eb_straddles_pages(u64 start, u64 len) +{ + if (len < PAGE_CACHE_SIZE) + return 1; + if (start & (PAGE_CACHE_SIZE - 1)) + return 1; + if ((start + len) & (PAGE_CACHE_SIZE - 1)) + return 1; + return 0; +} + +static int eb_straddles_pages(struct extent_buffer *eb) +{ + return __eb_straddles_pages(eb->start, eb->len); +} + int clear_extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb, struct extent_state **cached_state) @@ -3277,8 +3843,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); + if (eb_straddles_pages(eb)) { + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + cached_state, GFP_NOFS); + } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3296,8 +3864,10 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - NULL, GFP_NOFS); + if (eb_straddles_pages(eb)) { + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + NULL, GFP_NOFS); + } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || @@ -3320,9 +3890,12 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); - if (ret) - return 1; + if (__eb_straddles_pages(start, end - start + 1)) { + ret = test_range_bit(tree, start, end, + EXTENT_UPTODATE, 1, NULL); + if (ret) + return 1; + } while (start <= end) { index = start >> PAGE_CACHE_SHIFT; page = find_get_page(tree->mapping, index); @@ -3350,10 +3923,12 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 1; - ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, cached_state); - if (ret) - return ret; + if (eb_straddles_pages(eb)) { + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, cached_state); + if (ret) + return ret; + } num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { @@ -3367,8 +3942,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, } int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, - u64 start, int wait, + struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num) { unsigned long i; @@ -3386,9 +3960,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, NULL)) { - return 0; + if (eb_straddles_pages(eb)) { + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, NULL)) { + return 0; + } } if (start) { @@ -3402,7 +3978,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (!wait) { + if (wait == WAIT_NONE) { if (!trylock_page(page)) goto unlock_exit; } else { @@ -3446,7 +4022,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (bio) submit_one_bio(READ, bio, mirror_num, bio_flags); - if (ret || !wait) + if (ret || wait != WAIT_COMPLETE) return ret; for (i = start_i; i < num_pages; i++) { @@ -3492,9 +4068,8 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, page = extent_buffer_page(eb, i); cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); memcpy(dst, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER1); dst += cur; len -= cur; @@ -3504,9 +4079,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, char **token, char **map, + unsigned long min_len, char **map, unsigned long *map_start, - unsigned long *map_len, int km) + unsigned long *map_len) { size_t offset = start & (PAGE_CACHE_SIZE - 1); char *kaddr; @@ -3536,42 +4111,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, } p = extent_buffer_page(eb, i); - kaddr = kmap_atomic(p, km); - *token = kaddr; + kaddr = page_address(p); *map = kaddr + offset; *map_len = PAGE_CACHE_SIZE - offset; return 0; } -int map_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, - char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km) -{ - int err; - int save = 0; - if (eb->map_token) { - unmap_extent_buffer(eb, eb->map_token, km); - eb->map_token = NULL; - save = 1; - } - err = map_private_extent_buffer(eb, start, min_len, token, map, - map_start, map_len, km); - if (!err && save) { - eb->map_token = *token; - eb->kaddr = *map; - eb->map_start = *map_start; - eb->map_len = *map_len; - } - return err; -} - -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) -{ - kunmap_atomic(token, km); -} - int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) @@ -3595,9 +4140,8 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); ret = memcmp(ptr, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER0); if (ret) break; @@ -3630,9 +4174,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); memcpy(kaddr + offset, src, cur); - kunmap_atomic(kaddr, KM_USER1); src += cur; len -= cur; @@ -3661,9 +4204,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); memset(kaddr + offset, c, cur); - kunmap_atomic(kaddr, KM_USER0); len -= cur; offset = 0; @@ -3694,9 +4236,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); read_extent_buffer(src, kaddr + offset, src_offset, cur); - kunmap_atomic(kaddr, KM_USER0); src_offset += cur; len -= cur; @@ -3709,20 +4250,17 @@ static void move_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) { - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); if (dst_page == src_page) { memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); } else { - char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *src_kaddr = page_address(src_page); char *p = dst_kaddr + dst_off + len; char *s = src_kaddr + src_off + len; while (len--) *--p = *--s; - - kunmap_atomic(src_kaddr, KM_USER1); } - kunmap_atomic(dst_kaddr, KM_USER0); } static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) @@ -3735,20 +4273,17 @@ static void copy_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) { - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); char *src_kaddr; if (dst_page != src_page) { - src_kaddr = kmap_atomic(src_page, KM_USER1); + src_kaddr = page_address(src_page); } else { src_kaddr = dst_kaddr; BUG_ON(areas_overlap(src_off, dst_off, len)); } memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); - kunmap_atomic(dst_kaddr, KM_USER0); - if (dst_page != src_page) - kunmap_atomic(src_kaddr, KM_USER1); } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a11a92ee2d30..7604c3001322 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -17,6 +17,8 @@ #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) #define EXTENT_FIRST_DELALLOC (1 << 12) +#define EXTENT_NEED_WAIT (1 << 13) +#define EXTENT_DAMAGED (1 << 14) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) @@ -32,6 +34,7 @@ #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 +#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -67,7 +70,7 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, + u64 start, u64 end, int failed_mirror, struct extent_state *state); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, @@ -76,16 +79,17 @@ struct extent_io_ops { struct extent_state *state); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); - int (*set_bit_hook)(struct inode *inode, struct extent_state *state, - int *bits); - int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - int *bits); - int (*merge_extent_hook)(struct inode *inode, - struct extent_state *new, - struct extent_state *other); - int (*split_extent_hook)(struct inode *inode, - struct extent_state *orig, u64 split); - int (*write_cache_pages_lock_hook)(struct page *page); + void (*set_bit_hook)(struct inode *inode, struct extent_state *state, + int *bits); + void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, + int *bits); + void (*merge_extent_hook)(struct inode *inode, + struct extent_state *new, + struct extent_state *other); + void (*split_extent_hook)(struct inode *inode, + struct extent_state *orig, u64 split); + int (*write_cache_pages_lock_hook)(struct page *page, void *data, + void (*flush_fn)(void *)); }; struct extent_io_tree { @@ -108,8 +112,6 @@ struct extent_state { wait_queue_head_t wq; atomic_t refs; unsigned long state; - u64 split_start; - u64 split_end; /* for use by the FS */ u64 private; @@ -120,8 +122,6 @@ struct extent_state { struct extent_buffer { u64 start; unsigned long len; - char *map_token; - char *kaddr; unsigned long map_start; unsigned long map_len; struct page *first_page; @@ -130,14 +130,26 @@ struct extent_buffer { struct rcu_head rcu_head; atomic_t refs; - /* the spinlock is used to protect most operations */ - spinlock_t lock; + /* count of read lock holders on the extent buffer */ + atomic_t write_locks; + atomic_t read_locks; + atomic_t blocking_writers; + atomic_t blocking_readers; + atomic_t spinning_readers; + atomic_t spinning_writers; + + /* protects write locks */ + rwlock_t lock; + + /* readers use lock_wq while they wait for the write + * lock holders to unlock + */ + wait_queue_head_t write_lock_wq; - /* - * when we keep the lock held while blocking, waiters go onto - * the wq + /* writers use read_lock_wq while they wait for readers + * to unlock */ - wait_queue_head_t lock_wq; + wait_queue_head_t read_lock_wq; }; static inline void extent_set_compress_type(unsigned long *bio_flags, @@ -177,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent); + get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); void extent_io_exit(void); @@ -206,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int clear_bits, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, @@ -240,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); +#define WAIT_NONE 0 +#define WAIT_COMPLETE 1 +#define WAIT_PAGE_LOCK 2 int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num); +unsigned long num_extent_pages(u64 start, u64 len); +struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); static inline void extent_buffer_get(struct extent_buffer *eb) { @@ -279,15 +298,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, int extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb, struct extent_state *cached_state); -int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, - unsigned long min_len, char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, - unsigned long min_len, char **token, char **map, + unsigned long min_len, char **map, unsigned long *map_start, - unsigned long *map_len, int km); -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); + unsigned long *map_len); int extent_range_uptodate(struct extent_io_tree *tree, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, @@ -297,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode, struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); + +struct btrfs_mapping_tree; + +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, + u64 length, u64 logical, struct page *page, + int mirror_num); #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2d0410344ea3..7c97b3301459 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -183,22 +183,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) return 0; } -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) { - int ret = 0; struct extent_map *merge = NULL; struct rb_node *rb; - struct extent_map *em; - - write_lock(&tree->lock); - em = lookup_extent_mapping(tree, start, len); - - WARN_ON(!em || em->start != start); - - if (!em) - goto out; - - clear_bit(EXTENT_FLAG_PINNED, &em->flags); if (em->start != 0) { rb = rb_prev(&em->rb_node); @@ -225,6 +213,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) merge->in_tree = 0; free_extent_map(merge); } +} + +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +{ + int ret = 0; + struct extent_map *em; + + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, start, len); + + WARN_ON(!em || em->start != start); + + if (!em) + goto out; + + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + + try_merge_map(tree, em); free_extent_map(em); out: @@ -247,7 +253,6 @@ int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { int ret = 0; - struct extent_map *merge = NULL; struct rb_node *rb; struct extent_map *exist; @@ -263,30 +268,8 @@ int add_extent_mapping(struct extent_map_tree *tree, goto out; } atomic_inc(&em->refs); - if (em->start != 0) { - rb = rb_prev(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(merge, em)) { - em->start = merge->start; - em->len += merge->len; - em->block_len += merge->block_len; - em->block_start = merge->block_start; - merge->in_tree = 0; - rb_erase(&merge->rb_node, &tree->map); - free_extent_map(merge); - } - } - rb = rb_next(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(em, merge)) { - em->len += merge->len; - em->block_len += merge->len; - rb_erase(&merge->rb_node, &tree->map); - merge->in_tree = 0; - free_extent_map(merge); - } + + try_merge_map(tree, em); out: return ret; } @@ -299,19 +282,8 @@ static u64 range_end(u64 start, u64 len) return start + len; } -/** - * lookup_extent_mapping - lookup extent_map - * @tree: tree to lookup in - * @start: byte offset to start the search - * @len: length of the lookup range - * - * Find and return the first extent_map struct in @tree that intersects the - * [start, len] range. There may be additional objects in the tree that - * intersect, so check the object returned carefully to make sure that no - * additional lookups are needed. - */ -struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len) +struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len, int strict) { struct extent_map *em; struct rb_node *rb_node; @@ -320,38 +292,42 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 end = range_end(start, len); rb_node = __tree_search(&tree->map, start, &prev, &next); - if (!rb_node && prev) { - em = rb_entry(prev, struct extent_map, rb_node); - if (end > em->start && start < extent_map_end(em)) - goto found; - } - if (!rb_node && next) { - em = rb_entry(next, struct extent_map, rb_node); - if (end > em->start && start < extent_map_end(em)) - goto found; - } if (!rb_node) { - em = NULL; - goto out; - } - if (IS_ERR(rb_node)) { - em = ERR_CAST(rb_node); - goto out; + if (prev) + rb_node = prev; + else if (next) + rb_node = next; + else + return NULL; } + em = rb_entry(rb_node, struct extent_map, rb_node); - if (end > em->start && start < extent_map_end(em)) - goto found; - em = NULL; - goto out; + if (strict && !(end > em->start && start < extent_map_end(em))) + return NULL; -found: atomic_inc(&em->refs); -out: return em; } /** + * lookup_extent_mapping - lookup extent_map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. There may be additional objects in the tree that + * intersect, so check the object returned carefully to make sure that no + * additional lookups are needed. + */ +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + return __lookup_extent_mapping(tree, start, len, 1); +} + +/** * search_extent_mapping - find a nearby extent map * @tree: tree to lookup in * @start: byte offset to start the search @@ -365,38 +341,7 @@ out: struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { - struct extent_map *em; - struct rb_node *rb_node; - struct rb_node *prev = NULL; - struct rb_node *next = NULL; - - rb_node = __tree_search(&tree->map, start, &prev, &next); - if (!rb_node && prev) { - em = rb_entry(prev, struct extent_map, rb_node); - goto found; - } - if (!rb_node && next) { - em = rb_entry(next, struct extent_map, rb_node); - goto found; - } - if (!rb_node) { - em = NULL; - goto out; - } - if (IS_ERR(rb_node)) { - em = ERR_CAST(rb_node); - goto out; - } - em = rb_entry(rb_node, struct extent_map, rb_node); - goto found; - - em = NULL; - goto out; - -found: - atomic_inc(&em->refs); -out: - return em; + return __lookup_extent_mapping(tree, start, len, 0); } /** diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 90d4ee52cd45..c7fb3a4247d3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_csum_item *item; struct extent_buffer *leaf; u64 csum_offset = 0; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 item_last_offset = 0; u64 disk_bytenr; u32 diff; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int ret; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; @@ -177,6 +175,17 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, WARN_ON(bio->bi_vcnt <= 0); + /* + * the free space stuff is only read when it hasn't been + * updated in the current transaction. So, we can safely + * read from the commit root and sidestep a nasty deadlock + * between reading the free space cache and updating the csum tree. + */ + if (btrfs_is_free_space_inode(root, inode)) { + path->search_commit_root = 1; + path->skip_locking = 1; + } + disk_bytenr = (u64)bio->bi_sector << 9; if (dio) offset = logical_offset; @@ -279,10 +288,11 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int ret; size_t size; u64 csum_end; - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; if (search_commit) { path->skip_locking = 1; @@ -480,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, u64 bytenr, u64 len) { struct extent_buffer *leaf; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; @@ -537,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 csum_end; struct extent_buffer *leaf; int ret; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int blocksize_bits = root->fs_info->sb->s_blocksize_bits; root = root->fs_info->csum_root; @@ -664,15 +672,12 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_sector_sum *sector_sum; u32 nritems; u32 ins_size; - char *eb_map; - char *eb_token; - unsigned long map_len; - unsigned long map_start; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + sector_sum = sums->sums; again: next_offset = (u64)-1; @@ -814,30 +819,9 @@ found: item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + btrfs_item_size_nr(leaf, path->slots[0])); - eb_token = NULL; next_sector: - if (!eb_token || - (unsigned long)item + csum_size >= map_start + map_len) { - int err; - - if (eb_token) - unmap_extent_buffer(leaf, eb_token, KM_USER1); - eb_token = NULL; - err = map_private_extent_buffer(leaf, (unsigned long)item, - csum_size, - &eb_token, &eb_map, - &map_start, &map_len, KM_USER1); - if (err) - eb_token = NULL; - } - if (eb_token) { - memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)), - §or_sum->sum, csum_size); - } else { - write_extent_buffer(leaf, §or_sum->sum, - (unsigned long)item, csum_size); - } + write_extent_buffer(leaf, §or_sum->sum, (unsigned long)item, csum_size); total_bytes += root->sectorsize; sector_sum++; @@ -850,10 +834,7 @@ next_sector: goto next_sector; } } - if (eb_token) { - unmap_extent_buffer(leaf, eb_token, KM_USER1); - eb_token = NULL; - } + btrfs_mark_buffer_dirty(path->nodes[0]); if (total_bytes < sums->len) { btrfs_release_path(path); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 59cbdb120ad0..dafdfa059bf6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -74,7 +74,7 @@ struct inode_defrag { * If an existing record is found the defrag item you * pass in is freed */ -static int __btrfs_add_inode_defrag(struct inode *inode, +static void __btrfs_add_inode_defrag(struct inode *inode, struct inode_defrag *defrag) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -106,11 +106,11 @@ static int __btrfs_add_inode_defrag(struct inode *inode, BTRFS_I(inode)->in_defrag = 1; rb_link_node(&defrag->rb_node, parent, p); rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); - return 0; + return; exists: kfree(defrag); - return 0; + return; } @@ -123,7 +123,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, { struct btrfs_root *root = BTRFS_I(inode)->root; struct inode_defrag *defrag; - int ret = 0; u64 transid; if (!btrfs_test_opt(root, AUTO_DEFRAG)) @@ -150,9 +149,11 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->defrag_inodes_lock); if (!BTRFS_I(inode)->in_defrag) - ret = __btrfs_add_inode_defrag(inode, defrag); + __btrfs_add_inode_defrag(inode, defrag); + else + kfree(defrag); spin_unlock(&root->fs_info->defrag_inodes_lock); - return ret; + return 0; } /* @@ -855,7 +856,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, btrfs_drop_extent_cache(inode, start, end - 1, 0); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; again: recow = 0; split = start; @@ -1034,11 +1036,13 @@ out: * on error we return an unlocked page and the error value * on success we return a locked page and 0 */ -static int prepare_uptodate_page(struct page *page, u64 pos) +static int prepare_uptodate_page(struct page *page, u64 pos, + bool force_uptodate) { int ret = 0; - if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { + if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) && + !PageUptodate(page)) { ret = btrfs_readpage(NULL, page); if (ret) return ret; @@ -1059,12 +1063,13 @@ static int prepare_uptodate_page(struct page *page, u64 pos) static noinline int prepare_pages(struct btrfs_root *root, struct file *file, struct page **pages, size_t num_pages, loff_t pos, unsigned long first_index, - unsigned long last_index, size_t write_bytes) + size_t write_bytes, bool force_uptodate) { struct extent_state *cached_state = NULL; int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = fdentry(file)->d_inode; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; int faili = 0; u64 start_pos; @@ -1073,15 +1078,10 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, start_pos = pos & ~((u64)root->sectorsize - 1); last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; - if (start_pos > inode->i_size) { - err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); - if (err) - return err; - } - again: for (i = 0; i < num_pages; i++) { - pages[i] = grab_cache_page(inode->i_mapping, index + i); + pages[i] = find_or_create_page(inode->i_mapping, index + i, + mask); if (!pages[i]) { faili = i - 1; err = -ENOMEM; @@ -1089,10 +1089,11 @@ again: } if (i == 0) - err = prepare_uptodate_page(pages[i], pos); + err = prepare_uptodate_page(pages[i], pos, + force_uptodate); if (i == num_pages - 1) err = prepare_uptodate_page(pages[i], - pos + write_bytes); + pos + write_bytes, false); if (err) { page_cache_release(pages[i]); faili = i - 1; @@ -1158,10 +1159,10 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; unsigned long first_index; - unsigned long last_index; size_t num_written = 0; int nrptrs; int ret = 0; + bool force_page_uptodate = false; nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / @@ -1171,7 +1172,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, return -ENOMEM; first_index = pos >> PAGE_CACHE_SHIFT; - last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT; while (iov_iter_count(i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); @@ -1205,8 +1205,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * contents of pages from loop to loop */ ret = prepare_pages(root, file, pages, num_pages, - pos, first_index, last_index, - write_bytes); + pos, first_index, write_bytes, + force_page_uptodate); if (ret) { btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); @@ -1223,12 +1223,15 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (copied < write_bytes) nrptrs = 1; - if (copied == 0) + if (copied == 0) { + force_page_uptodate = true; dirty_pages = 0; - else + } else { + force_page_uptodate = false; dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + } /* * If we had a short copy we need to release the excess delaloc @@ -1238,9 +1241,11 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * managed to copy. */ if (num_pages > dirty_pages) { - if (copied > 0) - atomic_inc( - &BTRFS_I(inode)->outstanding_extents); + if (copied > 0) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } btrfs_delalloc_release_space(inode, (num_pages - dirty_pages) << PAGE_CACHE_SHIFT); @@ -1336,6 +1341,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; loff_t *ppos = &iocb->ki_pos; + u64 start_pos; ssize_t num_written = 0; ssize_t err = 0; size_t count, ocount; @@ -1384,6 +1390,15 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, file_update_time(file); BTRFS_I(inode)->sequence++; + start_pos = round_down(pos, root->sectorsize); + if (start_pos > i_size_read(inode)) { + err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } + if (unlikely(file->f_flags & O_DIRECT)) { num_written = __btrfs_direct_write(iocb, iov, nr_segs, pos, ppos, count, ocount); @@ -1601,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; } - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) - goto out; - locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -1638,23 +1649,53 @@ static long btrfs_fallocate(struct file *file, int mode, cur_offset = alloc_start; while (1) { + u64 actual_end; + em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); BUG_ON(IS_ERR_OR_NULL(em)); last_byte = min(extent_map_end(em), alloc_end); + actual_end = min_t(u64, extent_map_end(em), offset + len); last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + + /* + * Make sure we have enough space before we do the + * allocation. + */ + ret = btrfs_check_data_free_space(inode, last_byte - + cur_offset); + if (ret) { + free_extent_map(em); + break; + } + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, last_byte - cur_offset, 1 << inode->i_blkbits, offset + len, &alloc_hint); + + /* Let go of our reservation. */ + btrfs_free_reserved_data_space(inode, last_byte - + cur_offset); if (ret < 0) { free_extent_map(em); break; } + } else if (actual_end > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + /* + * We didn't need to allocate any more space, but we + * still extended the size of the file so we need to + * update i_size. + */ + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, NULL); } free_extent_map(em); @@ -1666,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode, } unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); - - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; @@ -1793,10 +1832,15 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) switch (origin) { case SEEK_END: case SEEK_CUR: - offset = generic_file_llseek_unlocked(file, offset, origin); + offset = generic_file_llseek(file, offset, origin); goto out; case SEEK_DATA: case SEEK_HOLE: + if (offset >= i_size_read(inode)) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + ret = find_desired_extent(inode, &offset, origin); if (ret) { mutex_unlock(&inode->i_mutex); @@ -1804,10 +1848,14 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) } } - if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) - return -EINVAL; - if (offset > inode->i_sb->s_maxbytes) - return -EINVAL; + if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { + offset = -EINVAL; + goto out; + } + if (offset > inode->i_sb->s_maxbytes) { + offset = -EINVAL; + goto out; + } /* Special lock needed here? */ if (offset != file->f_pos) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bf0d61567f3d..ec23d43d0c35 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -20,6 +20,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/math64.h> +#include <linux/ratelimit.h> #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, *block_group, struct btrfs_path *path) { struct inode *inode = NULL; + u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; spin_lock(&block_group->lock); if (block_group->inode) @@ -98,7 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; spin_lock(&block_group->lock); - if (!btrfs_fs_closing(root->fs_info)) { + if (!((BTRFS_I(inode)->flags & flags) == flags)) { + printk(KERN_INFO "Old style space inode found, converting.\n"); + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | + BTRFS_INODE_NODATACOW; + block_group->disk_cache_state = BTRFS_DC_CLEAR; + } + + if (!block_group->iref) { block_group->inode = igrab(inode); block_group->iref = 1; } @@ -116,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header *header; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret; ret = btrfs_insert_empty_inode(trans, root, path, ino); if (ret) return ret; + /* We inline crc's for the free disk space cache */ + if (ino != BTRFS_FREE_INO_OBJECTID) + flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; + leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -134,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_uid(leaf, inode_item, 0); btrfs_set_inode_gid(leaf, inode_item, 0); btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); + btrfs_set_inode_flags(leaf, inode_item, flags); btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); @@ -184,15 +197,25 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_path *path, struct inode *inode) { + struct btrfs_block_rsv *rsv; + u64 needed_bytes; loff_t oldsize; int ret = 0; - trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, - 0, 5); - if (ret) - return ret; + rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + + /* 1 for slack space, 1 for updating the inode */ + needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + + btrfs_calc_trans_metadata_size(root, 1); + + spin_lock(&trans->block_rsv->lock); + if (trans->block_rsv->reserved < needed_bytes) { + spin_unlock(&trans->block_rsv->lock); + trans->block_rsv = rsv; + return -ENOSPC; + } + spin_unlock(&trans->block_rsv->lock); oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); @@ -204,12 +227,16 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); + if (ret) { + trans->block_rsv = rsv; WARN_ON(1); return ret; } ret = btrfs_update_inode(trans, root, inode); + trans->block_rsv = rsv; + return ret; } @@ -232,31 +259,348 @@ static int readahead_cache(struct inode *inode) return 0; } +struct io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_root *root; + unsigned long size; + int index; + int num_pages; + unsigned check_crcs:1; +}; + +static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, + struct btrfs_root *root) +{ + memset(io_ctl, 0, sizeof(struct io_ctl)); + io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages, + GFP_NOFS); + if (!io_ctl->pages) + return -ENOMEM; + io_ctl->root = root; + if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) + io_ctl->check_crcs = 1; + return 0; +} + +static void io_ctl_free(struct io_ctl *io_ctl) +{ + kfree(io_ctl->pages); +} + +static void io_ctl_unmap_page(struct io_ctl *io_ctl) +{ + if (io_ctl->cur) { + kunmap(io_ctl->page); + io_ctl->cur = NULL; + io_ctl->orig = NULL; + } +} + +static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) +{ + WARN_ON(io_ctl->cur); + BUG_ON(io_ctl->index >= io_ctl->num_pages); + io_ctl->page = io_ctl->pages[io_ctl->index++]; + io_ctl->cur = kmap(io_ctl->page); + io_ctl->orig = io_ctl->cur; + io_ctl->size = PAGE_CACHE_SIZE; + if (clear) + memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); +} + +static void io_ctl_drop_pages(struct io_ctl *io_ctl) +{ + int i; + + io_ctl_unmap_page(io_ctl); + + for (i = 0; i < io_ctl->num_pages; i++) { + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); + } +} + +static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, + int uptodate) +{ + struct page *page; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + int i; + + for (i = 0; i < io_ctl->num_pages; i++) { + page = find_or_create_page(inode->i_mapping, i, mask); + if (!page) { + io_ctl_drop_pages(io_ctl); + return -ENOMEM; + } + io_ctl->pages[i] = page; + if (uptodate && !PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + printk(KERN_ERR "btrfs: error reading free " + "space cache\n"); + io_ctl_drop_pages(io_ctl); + return -EIO; + } + } + } + + for (i = 0; i < io_ctl->num_pages; i++) { + clear_page_dirty_for_io(io_ctl->pages[i]); + set_page_extent_mapped(io_ctl->pages[i]); + } + + return 0; +} + +static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) +{ + u64 *val; + + io_ctl_map_page(io_ctl, 1); + + /* + * Skip the csum areas. If we don't check crcs then we just have a + * 64bit chunk at the front of the first page. + */ + if (io_ctl->check_crcs) { + io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } + + val = io_ctl->cur; + *val = cpu_to_le64(generation); + io_ctl->cur += sizeof(u64); +} + +static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) +{ + u64 *gen; + + /* + * Skip the crc area. If we don't check crcs then we just have a 64bit + * chunk at the front of the first page. + */ + if (io_ctl->check_crcs) { + io_ctl->cur += sizeof(u32) * io_ctl->num_pages; + io_ctl->size -= sizeof(u64) + + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } + + gen = io_ctl->cur; + if (le64_to_cpu(*gen) != generation) { + printk_ratelimited(KERN_ERR "btrfs: space cache generation " + "(%Lu) does not match inode (%Lu)\n", *gen, + generation); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + io_ctl->cur += sizeof(u64); + return 0; +} + +static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_unmap_page(io_ctl); + return; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages;; + + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + io_ctl_unmap_page(io_ctl); + tmp = kmap(io_ctl->pages[0]); + tmp += index; + *tmp = crc; + kunmap(io_ctl->pages[0]); +} + +static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp, val; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_map_page(io_ctl, 0); + return 0; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages; + + tmp = kmap(io_ctl->pages[0]); + tmp += index; + val = *tmp; + kunmap(io_ctl->pages[0]); + + io_ctl_map_page(io_ctl, 0); + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + if (val != crc) { + printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free " + "space cache\n"); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + + return 0; +} + +static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, + void *bitmap) +{ + struct btrfs_free_space_entry *entry; + + if (!io_ctl->cur) + return -ENOSPC; + + entry = io_ctl->cur; + entry->offset = cpu_to_le64(offset); + entry->bytes = cpu_to_le64(bytes); + entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : + BTRFS_FREE_SPACE_EXTENT; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + + /* No more pages to map */ + if (io_ctl->index >= io_ctl->num_pages) + return 0; + + /* map the next page */ + io_ctl_map_page(io_ctl, 1); + return 0; +} + +static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) +{ + if (!io_ctl->cur) + return -ENOSPC; + + /* + * If we aren't at the start of the current page, unmap this one and + * map the next one if there is any left. + */ + if (io_ctl->cur != io_ctl->orig) { + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + if (io_ctl->index >= io_ctl->num_pages) + return -ENOSPC; + io_ctl_map_page(io_ctl, 0); + } + + memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + if (io_ctl->index < io_ctl->num_pages) + io_ctl_map_page(io_ctl, 0); + return 0; +} + +static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) +{ + /* + * If we're not on the boundary we know we've modified the page and we + * need to crc the page. + */ + if (io_ctl->cur != io_ctl->orig) + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + else + io_ctl_unmap_page(io_ctl); + + while (io_ctl->index < io_ctl->num_pages) { + io_ctl_map_page(io_ctl, 1); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + } +} + +static int io_ctl_read_entry(struct io_ctl *io_ctl, + struct btrfs_free_space *entry, u8 *type) +{ + struct btrfs_free_space_entry *e; + int ret; + + if (!io_ctl->cur) { + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + } + + e = io_ctl->cur; + entry->offset = le64_to_cpu(e->offset); + entry->bytes = le64_to_cpu(e->bytes); + *type = e->type; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + io_ctl_unmap_page(io_ctl); + + return 0; +} + +static int io_ctl_read_bitmap(struct io_ctl *io_ctl, + struct btrfs_free_space *entry) +{ + int ret; + + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + + memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); + io_ctl_unmap_page(io_ctl); + + return 0; +} + int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) { struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct page *page; - u32 *checksums = NULL, *crc; - char *disk_crcs = NULL; + struct io_ctl io_ctl; struct btrfs_key key; + struct btrfs_free_space *e, *n; struct list_head bitmaps; u64 num_entries; u64 num_bitmaps; u64 generation; - u32 cur_crc = ~(u32)0; - pgoff_t index = 0; - unsigned long first_page_offset; - int num_checksums; + u8 type; int ret = 0; INIT_LIST_HEAD(&bitmaps); /* Nothing in the space cache, goodbye */ if (!i_size_read(inode)) - goto out; + return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; @@ -264,11 +608,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return 0; else if (ret > 0) { btrfs_release_path(path); - ret = 0; - goto out; + return 0; } ret = -1; @@ -286,194 +629,102 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, " not match free space cache generation (%llu)\n", (unsigned long long)BTRFS_I(inode)->generation, (unsigned long long)generation); - goto out; + return 0; } if (!num_entries) - goto out; - - /* Setup everything for doing checksumming */ - num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; - checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); - if (!checksums) - goto out; - first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); - disk_crcs = kzalloc(first_page_offset, GFP_NOFS); - if (!disk_crcs) - goto out; + return 0; + io_ctl_init(&io_ctl, inode, root); ret = readahead_cache(inode); if (ret) goto out; - while (1) { - struct btrfs_free_space_entry *entry; - struct btrfs_free_space *e; - void *addr; - unsigned long offset = 0; - unsigned long start_offset = 0; - int need_loop = 0; + ret = io_ctl_prepare_pages(&io_ctl, inode, 1); + if (ret) + goto out; - if (!num_entries && !num_bitmaps) - break; + ret = io_ctl_check_crc(&io_ctl, 0); + if (ret) + goto free_cache; - if (index == 0) { - start_offset = first_page_offset; - offset = start_offset; - } + ret = io_ctl_check_generation(&io_ctl, generation); + if (ret) + goto free_cache; - page = grab_cache_page(inode->i_mapping, index); - if (!page) + while (num_entries) { + e = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!e) goto free_cache; - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - page_cache_release(page); - printk(KERN_ERR "btrfs: error reading free " - "space cache\n"); - goto free_cache; - } - } - addr = kmap(page); - - if (index == 0) { - u64 *gen; - - memcpy(disk_crcs, addr, first_page_offset); - gen = addr + (sizeof(u32) * num_checksums); - if (*gen != BTRFS_I(inode)->generation) { - printk(KERN_ERR "btrfs: space cache generation" - " (%llu) does not match inode (%llu)\n", - (unsigned long long)*gen, - (unsigned long long) - BTRFS_I(inode)->generation); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - crc = (u32 *)disk_crcs; - } - entry = addr + start_offset; - - /* First lets check our crc before we do anything fun */ - cur_crc = ~(u32)0; - cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc, - PAGE_CACHE_SIZE - start_offset); - btrfs_csum_final(cur_crc, (char *)&cur_crc); - if (cur_crc != *crc) { - printk(KERN_ERR "btrfs: crc mismatch for page %lu\n", - index); - kunmap(page); - unlock_page(page); - page_cache_release(page); + ret = io_ctl_read_entry(&io_ctl, e, &type); + if (ret) { + kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } - crc++; - while (1) { - if (!num_entries) - break; + if (!e->bytes) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } - need_loop = 1; - e = kmem_cache_zalloc(btrfs_free_space_cachep, - GFP_NOFS); - if (!e) { - kunmap(page); - unlock_page(page); - page_cache_release(page); + if (type == BTRFS_FREE_SPACE_EXTENT) { + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } - - e->offset = le64_to_cpu(entry->offset); - e->bytes = le64_to_cpu(entry->bytes); - if (!e->bytes) { - kunmap(page); - kmem_cache_free(btrfs_free_space_cachep, e); - unlock_page(page); - page_cache_release(page); + } else { + BUG_ON(!num_bitmaps); + num_bitmaps--; + e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!e->bitmap) { + kmem_cache_free( + btrfs_free_space_cachep, e); goto free_cache; } - - if (entry->type == BTRFS_FREE_SPACE_EXTENT) { - spin_lock(&ctl->tree_lock); - ret = link_free_space(ctl, e); - spin_unlock(&ctl->tree_lock); - if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - } else { - e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!e->bitmap) { - kunmap(page); - kmem_cache_free( - btrfs_free_space_cachep, e); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - spin_lock(&ctl->tree_lock); - ret = link_free_space(ctl, e); - ctl->total_bitmaps++; - ctl->op->recalc_thresholds(ctl); - spin_unlock(&ctl->tree_lock); - if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - list_add_tail(&e->list, &bitmaps); + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + ctl->total_bitmaps++; + ctl->op->recalc_thresholds(ctl); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; } - - num_entries--; - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - break; - entry++; + list_add_tail(&e->list, &bitmaps); } - /* - * We read an entry out of this page, we need to move on to the - * next page. - */ - if (need_loop) { - kunmap(page); - goto next; - } + num_entries--; + } - /* - * We add the bitmaps at the end of the entries in order that - * the bitmap entries are added to the cache. - */ - e = list_entry(bitmaps.next, struct btrfs_free_space, list); + io_ctl_unmap_page(&io_ctl); + + /* + * We add the bitmaps at the end of the entries in order that + * the bitmap entries are added to the cache. + */ + list_for_each_entry_safe(e, n, &bitmaps, list) { list_del_init(&e->list); - memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); - kunmap(page); - num_bitmaps--; -next: - unlock_page(page); - page_cache_release(page); - index++; + ret = io_ctl_read_bitmap(&io_ctl, e); + if (ret) + goto free_cache; } + io_ctl_drop_pages(&io_ctl); ret = 1; out: - kfree(checksums); - kfree(disk_crcs); + io_ctl_free(&io_ctl); return ret; free_cache: + io_ctl_drop_pages(&io_ctl); __btrfs_remove_free_space_cache(ctl); goto out; } @@ -485,7 +736,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root = fs_info->tree_root; struct inode *inode; struct btrfs_path *path; - int ret; + int ret = 0; bool matched; u64 used = btrfs_block_group_used(&block_group->item); @@ -517,6 +768,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, return 0; } + /* We may have converted the inode and made the cache invalid. */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + goto out; + } + spin_unlock(&block_group->lock); + ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, path, block_group->key.objectid); btrfs_free_path(path); @@ -550,6 +809,19 @@ out: return ret; } +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount. This will return 0 if it was successfull in writing the cache out, + * and -1 if it was not. + */ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, @@ -560,64 +832,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct extent_buffer *leaf; struct rb_node *node; struct list_head *pos, *n; - struct page **pages; - struct page *page; struct extent_state *cached_state = NULL; struct btrfs_free_cluster *cluster = NULL; struct extent_io_tree *unpin = NULL; + struct io_ctl io_ctl; struct list_head bitmap_list; struct btrfs_key key; u64 start, end, len; - u64 bytes = 0; - u32 *crc, *checksums; - unsigned long first_page_offset; - int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; - int ret = -1; - bool next_page = false; - bool out_of_space = false; + int ret; + int err = -1; INIT_LIST_HEAD(&bitmap_list); - node = rb_first(&ctl->free_space_offset); - if (!node) - return 0; - if (!i_size_read(inode)) return -1; - num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - - /* Since the first page has all of our checksums and our generation we - * need to calculate the offset into the page that we can start writing - * our entries. - */ - first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); - - filemap_write_and_wait(inode->i_mapping); - btrfs_wait_ordered_range(inode, inode->i_size & - ~(root->sectorsize - 1), (u64)-1); - - /* make sure we don't overflow that first page */ - if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) { - /* this is really the same as running out of space, where we also return 0 */ - printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n"); - ret = 0; - goto out_update; - } - - /* We need a checksum per page. */ - crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); - if (!crc) - return -1; - - pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); - if (!pages) { - kfree(crc); - return -1; - } + io_ctl_init(&io_ctl, inode, root); /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) @@ -631,30 +863,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, */ unpin = root->fs_info->pinned_extents; - /* - * Lock all pages first so we can lock the extent safely. - * - * NOTE: Because we hold the ref the entire time we're going to write to - * the page find_get_page should never fail, so we don't do a check - * after find_get_page at this point. Just putting this here so people - * know and don't freak out. - */ - while (index < num_pages) { - page = grab_cache_page(inode->i_mapping, index); - if (!page) { - int i; - - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - goto out_free; - } - pages[index] = page; - index++; - } + /* Lock all pages first so we can lock the extent safely. */ + io_ctl_prepare_pages(&io_ctl, inode, 0); - index = 0; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); @@ -665,192 +876,112 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (block_group) start = block_group->key.objectid; - /* Write out the extent entries */ - do { - struct btrfs_free_space_entry *entry; - void *addr; - unsigned long offset = 0; - unsigned long start_offset = 0; - - next_page = false; - - if (index == 0) { - start_offset = first_page_offset; - offset = start_offset; - } + node = rb_first(&ctl->free_space_offset); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; + } - if (index >= num_pages) { - out_of_space = true; - break; - } + /* Make sure we can fit our crcs into the first page */ + if (io_ctl.check_crcs && + (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { + WARN_ON(1); + goto out_nospc; + } - page = pages[index]; + io_ctl_set_generation(&io_ctl, trans->transid); - addr = kmap(page); - entry = addr + start_offset; + /* Write out the extent entries */ + while (node) { + struct btrfs_free_space *e; - memset(addr, 0, PAGE_CACHE_SIZE); - while (node && !next_page) { - struct btrfs_free_space *e; + e = rb_entry(node, struct btrfs_free_space, offset_index); + entries++; - e = rb_entry(node, struct btrfs_free_space, offset_index); - entries++; + ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, + e->bitmap); + if (ret) + goto out_nospc; - entry->offset = cpu_to_le64(e->offset); - entry->bytes = cpu_to_le64(e->bytes); - if (e->bitmap) { - entry->type = BTRFS_FREE_SPACE_BITMAP; - list_add_tail(&e->list, &bitmap_list); - bitmaps++; - } else { - entry->type = BTRFS_FREE_SPACE_EXTENT; - } - node = rb_next(node); - if (!node && cluster) { - node = rb_first(&cluster->root); - cluster = NULL; - } - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - next_page = true; - entry++; + if (e->bitmap) { + list_add_tail(&e->list, &bitmap_list); + bitmaps++; } + node = rb_next(node); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; + } + } - /* - * We want to add any pinned extents to our free space cache - * so we don't leak the space - */ - while (block_group && !next_page && - (start < block_group->key.objectid + - block_group->key.offset)) { - ret = find_first_extent_bit(unpin, start, &start, &end, - EXTENT_DIRTY); - if (ret) { - ret = 0; - break; - } - - /* This pinned extent is out of our range */ - if (start >= block_group->key.objectid + - block_group->key.offset) - break; - - len = block_group->key.objectid + - block_group->key.offset - start; - len = min(len, end + 1 - start); - - entries++; - entry->offset = cpu_to_le64(start); - entry->bytes = cpu_to_le64(len); - entry->type = BTRFS_FREE_SPACE_EXTENT; - - start = end + 1; - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - next_page = true; - entry++; + /* + * We want to add any pinned extents to our free space cache + * so we don't leak the space + */ + while (block_group && (start < block_group->key.objectid + + block_group->key.offset)) { + ret = find_first_extent_bit(unpin, start, &start, &end, + EXTENT_DIRTY); + if (ret) { + ret = 0; + break; } - *crc = ~(u32)0; - *crc = btrfs_csum_data(root, addr + start_offset, *crc, - PAGE_CACHE_SIZE - start_offset); - kunmap(page); - btrfs_csum_final(*crc, (char *)crc); - crc++; + /* This pinned extent is out of our range */ + if (start >= block_group->key.objectid + + block_group->key.offset) + break; + + len = block_group->key.objectid + + block_group->key.offset - start; + len = min(len, end + 1 - start); - bytes += PAGE_CACHE_SIZE; + entries++; + ret = io_ctl_add_entry(&io_ctl, start, len, NULL); + if (ret) + goto out_nospc; - index++; - } while (node || next_page); + start = end + 1; + } /* Write out the bitmaps */ list_for_each_safe(pos, n, &bitmap_list) { - void *addr; struct btrfs_free_space *entry = list_entry(pos, struct btrfs_free_space, list); - if (index >= num_pages) { - out_of_space = true; - break; - } - page = pages[index]; - - addr = kmap(page); - memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); - *crc = ~(u32)0; - *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE); - kunmap(page); - btrfs_csum_final(*crc, (char *)crc); - crc++; - bytes += PAGE_CACHE_SIZE; - + ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); + if (ret) + goto out_nospc; list_del_init(&entry->list); - index++; - } - - if (out_of_space) { - btrfs_drop_pages(pages, num_pages); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, - GFP_NOFS); - ret = 0; - goto out_free; } /* Zero out the rest of the pages just to make sure */ - while (index < num_pages) { - void *addr; - - page = pages[index]; - addr = kmap(page); - memset(addr, 0, PAGE_CACHE_SIZE); - kunmap(page); - bytes += PAGE_CACHE_SIZE; - index++; - } - - /* Write the checksums and trans id to the first page */ - { - void *addr; - u64 *gen; - - page = pages[0]; + io_ctl_zero_remaining_pages(&io_ctl); - addr = kmap(page); - memcpy(addr, checksums, sizeof(u32) * num_pages); - gen = addr + (sizeof(u32) * num_pages); - *gen = trans->transid; - kunmap(page); - } - - ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, - bytes, &cached_state); - btrfs_drop_pages(pages, num_pages); + ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, + 0, i_size_read(inode), &cached_state); + io_ctl_drop_pages(&io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); - if (ret) { - ret = 0; - goto out_free; - } + if (ret) + goto out; - BTRFS_I(inode)->generation = trans->transid; - filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + goto out; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; key.type = 0; - ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - ret = -1; - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); - goto out_free; + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); + goto out; } leaf = path->nodes[0]; if (ret > 0) { @@ -860,15 +991,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { - ret = -1; - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, - GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + NULL, GFP_NOFS); btrfs_release_path(path); - goto out_free; + goto out; } } + + BTRFS_I(inode)->generation = trans->transid; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); btrfs_set_free_space_entries(leaf, header, entries); @@ -877,19 +1009,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - ret = 1; - -out_free: - kfree(checksums); - kfree(pages); - -out_update: - if (ret != 1) { - invalidate_inode_pages2_range(inode->i_mapping, 0, index); + err = 0; +out: + io_ctl_free(&io_ctl); + if (err) { + invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); - return ret; + return err; + +out_nospc: + list_for_each_safe(pos, n, &bitmap_list) { + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + list_del_init(&entry->list); + } + io_ctl_drop_pages(&io_ctl); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + goto out; } int btrfs_write_out_cache(struct btrfs_root *root, @@ -916,14 +1055,15 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); - if (ret < 0) { + if (ret) { spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); ret = 0; - +#ifdef DEBUG printk(KERN_ERR "btrfs: failed to write free space cace " "for block group %llu\n", block_group->key.objectid); +#endif } iput(inode); @@ -1219,9 +1359,9 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); } -static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, u64 offset, - u64 bytes) +static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + u64 offset, u64 bytes) { unsigned long start, count; @@ -1232,6 +1372,13 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, bitmap_clear(info->bitmap, start, count); info->bytes -= bytes; +} + +static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + __bitmap_clear_bits(ctl, info, offset, bytes); ctl->free_space -= bytes; } @@ -1323,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, { info->offset = offset_to_bitmap(ctl, offset); info->bytes = 0; + INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; @@ -1702,7 +1850,13 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - WARN_ON(1); + /* the tree logging code might be calling us before we + * have fully loaded the free space rbtree for this + * block group. So it is possible the entry won't + * be in the rbtree yet at all. The caching code + * will make sure not to put it in the rbtree if + * the logging code has pinned it. + */ goto out_lock; } } @@ -1741,6 +1895,7 @@ again: ctl->total_bitmaps--; } kmem_cache_free(btrfs_free_space_cachep, info); + ret = 0; goto out_lock; } @@ -1748,7 +1903,8 @@ again: unlink_free_space(ctl, info); info->offset += bytes; info->bytes -= bytes; - link_free_space(ctl, info); + ret = link_free_space(ctl, info); + WARN_ON(ret); goto out_lock; } @@ -2035,7 +2191,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, return 0; ret = search_start; - bitmap_clear_bits(ctl, entry, ret, bytes); + __bitmap_clear_bits(ctl, entry, ret, bytes); return ret; } @@ -2090,7 +2246,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, continue; } } else { - ret = entry->offset; entry->offset += bytes; @@ -2165,6 +2320,7 @@ again: if (!found) { start = i; + cluster->max_size = 0; found = true; } @@ -2308,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; - struct rb_node *node; int ret = -ENOSPC; + u64 bitmap_offset = offset_to_bitmap(ctl, offset); if (ctl->total_bitmaps == 0) return -ENOSPC; /* - * First check our cached list of bitmaps and see if there is an entry - * here that will work. + * The bitmap that covers offset won't be in the list unless offset + * is just its start offset. */ + entry = list_first_entry(bitmaps, struct btrfs_free_space, list); + if (entry->offset != bitmap_offset) { + entry = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (entry && list_empty(&entry->list)) + list_add(&entry->list, bitmaps); + } + list_for_each_entry(entry, bitmaps, list) { if (entry->bytes < min_bytes) continue; @@ -2328,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, } /* - * If we do have entries on our list and we are here then we didn't find - * anything, so go ahead and get the next entry after the last entry in - * this list and start the search from there. + * The bitmaps list has all the bitmaps that record free space + * starting after offset, so no more search is required. */ - if (!list_empty(bitmaps)) { - entry = list_entry(bitmaps->prev, struct btrfs_free_space, - list); - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; - entry = rb_entry(node, struct btrfs_free_space, offset_index); - goto search; - } - - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); - if (!entry) - return -ENOSPC; - -search: - node = &entry->offset_index; - do { - entry = rb_entry(node, struct btrfs_free_space, offset_index); - node = rb_next(&entry->offset_index); - if (!entry->bitmap) - continue; - if (entry->bytes < min_bytes) - continue; - ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); - } while (ret && node); - - return ret; + return -ENOSPC; } /* @@ -2377,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct list_head bitmaps; struct btrfs_free_space *entry, *tmp; + LIST_HEAD(bitmaps); u64 min_bytes; int ret; @@ -2417,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes, min_bytes); if (ret) @@ -2513,9 +2647,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, spin_unlock(&ctl->tree_lock); if (bytes >= minlen) { - int update_ret; - update_ret = btrfs_update_reserved_bytes(block_group, - bytes, 1, 1); + struct btrfs_space_info *space_info; + int update = 0; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += bytes; + space_info->bytes_reserved += bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); ret = btrfs_error_discard_extent(fs_info->extent_root, start, @@ -2523,9 +2667,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, &actually_trimmed); btrfs_add_free_space(block_group, start, bytes); - if (!update_ret) - btrfs_update_reserved_bytes(block_group, - bytes, 0, 1); + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += bytes; + block_group->reserved -= bytes; + space_info->bytes_reserved -= bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } if (ret) break; @@ -2684,9 +2835,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, return 0; ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); - if (ret < 0) + if (ret) { + btrfs_delalloc_release_metadata(inode, inode->i_size); +#ifdef DEBUG printk(KERN_ERR "btrfs: failed to write free ino cache " "for root %llu\n", root->root_key.objectid); +#endif + } iput(inode); return ret; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index b4087e0fa871..f8962a957d65 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_path *path; struct inode *inode; + struct btrfs_block_rsv *rsv; + u64 num_bytes; u64 alloc_hint = 0; int ret; int prealloc; @@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (!path) return -ENOMEM; + rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->trans_block_rsv; + + num_bytes = trans->bytes_reserved; + /* + * 1 item for inode item insertion if need + * 3 items for inode item update (in the worst case) + * 1 item for free space object + * 3 items for pre-allocation + */ + trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); + ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, + trans->bytes_reserved); + if (ret) + goto out; again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { ret = PTR_ERR(inode); - goto out; + goto out_release; } if (IS_ERR(inode)) { @@ -434,7 +451,7 @@ again: ret = create_free_ino_inode(root, trans, path); if (ret) - goto out; + goto out_release; goto again; } @@ -465,21 +482,26 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, prealloc); + ret = btrfs_delalloc_reserve_space(inode, prealloc); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); - if (ret) + if (ret) { + btrfs_delalloc_release_space(inode, prealloc); goto out_put; + } btrfs_free_reserved_data_space(inode, prealloc); + ret = btrfs_write_out_ino_cache(root, trans, path); out_put: iput(inode); +out_release: + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: - if (ret == 0) - ret = btrfs_write_out_ino_cache(root, trans, path); + trans->block_rsv = rsv; + trans->bytes_reserved = num_bytes; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2548a04a0230..2c984f7d4c2a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -45,10 +45,10 @@ #include "btrfs_inode.h" #include "ioctl.h" #include "print-tree.h" -#include "volumes.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" +#include "volumes.h" #include "compression.h" #include "locking.h" #include "free-space-cache.h" @@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -393,7 +395,10 @@ again: (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); - BUG_ON(!pages); + if (!pages) { + /* just bail out to the uncompressed code */ + goto cont; + } if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; @@ -424,6 +429,7 @@ again: will_compress = 1; } } +cont: if (start == 0) { trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); @@ -750,15 +756,6 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, return alloc_hint; } -static inline bool is_free_space_inode(struct btrfs_root *root, - struct inode *inode) -{ - if (root == root->fs_info->tree_root || - BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) - return true; - return false; -} - /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to @@ -791,7 +788,7 @@ static noinline int cow_file_range(struct inode *inode, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; int ret = 0; - BUG_ON(is_free_space_inode(root, inode)); + BUG_ON(btrfs_is_free_space_inode(root, inode)); trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -829,7 +826,7 @@ static noinline int cow_file_range(struct inode *inode, } BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(&root->fs_info->super_copy)); + btrfs_super_total_bytes(root->fs_info->super_copy)); alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); @@ -1070,9 +1067,10 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; - nolock = is_free_space_inode(root, inode); + nolock = btrfs_is_free_space_inode(root, inode); if (nolock) trans = btrfs_join_transaction_nolock(root); @@ -1291,15 +1289,16 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, return ret; } -static int btrfs_split_extent_hook(struct inode *inode, - struct extent_state *orig, u64 split) +static void btrfs_split_extent_hook(struct inode *inode, + struct extent_state *orig, u64 split) { /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) - return 0; + return; - atomic_inc(&BTRFS_I(inode)->outstanding_extents); - return 0; + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); } /* @@ -1308,16 +1307,17 @@ static int btrfs_split_extent_hook(struct inode *inode, * extents, such as when we are doing sequential writes, so we can properly * account for the metadata space we'll need. */ -static int btrfs_merge_extent_hook(struct inode *inode, - struct extent_state *new, - struct extent_state *other) +static void btrfs_merge_extent_hook(struct inode *inode, + struct extent_state *new, + struct extent_state *other) { /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) - return 0; + return; - atomic_dec(&BTRFS_I(inode)->outstanding_extents); - return 0; + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->lock); } /* @@ -1325,8 +1325,8 @@ static int btrfs_merge_extent_hook(struct inode *inode, * bytes in this file, and to maintain the list of inodes that * have pending delalloc work to be done. */ -static int btrfs_set_bit_hook(struct inode *inode, - struct extent_state *state, int *bits) +static void btrfs_set_bit_hook(struct inode *inode, + struct extent_state *state, int *bits) { /* @@ -1337,12 +1337,15 @@ static int btrfs_set_bit_hook(struct inode *inode, if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - bool do_list = !is_free_space_inode(root, inode); + bool do_list = !btrfs_is_free_space_inode(root, inode); - if (*bits & EXTENT_FIRST_DELALLOC) + if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; - else - atomic_inc(&BTRFS_I(inode)->outstanding_extents); + } else { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += len; @@ -1353,14 +1356,13 @@ static int btrfs_set_bit_hook(struct inode *inode, } spin_unlock(&root->fs_info->delalloc_lock); } - return 0; } /* * extent_io.c clear_bit_hook, see set_bit_hook for why */ -static int btrfs_clear_bit_hook(struct inode *inode, - struct extent_state *state, int *bits) +static void btrfs_clear_bit_hook(struct inode *inode, + struct extent_state *state, int *bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore @@ -1370,12 +1372,15 @@ static int btrfs_clear_bit_hook(struct inode *inode, if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - bool do_list = !is_free_space_inode(root, inode); + bool do_list = !btrfs_is_free_space_inode(root, inode); - if (*bits & EXTENT_FIRST_DELALLOC) + if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; - else if (!(*bits & EXTENT_DO_ACCOUNTING)) - atomic_dec(&BTRFS_I(inode)->outstanding_extents); + } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->lock); + } if (*bits & EXTENT_DO_ACCOUNTING) btrfs_delalloc_release_metadata(inode, len); @@ -1394,7 +1399,6 @@ static int btrfs_clear_bit_hook(struct inode *inode, } spin_unlock(&root->fs_info->delalloc_lock); } - return 0; } /* @@ -1477,7 +1481,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - if (is_free_space_inode(root, inode)) + if (btrfs_is_free_space_inode(root, inode)) ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); else ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); @@ -1644,7 +1648,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, int ret; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; path->leave_spinning = 1; @@ -1726,7 +1731,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) return 0; BUG_ON(!ordered_extent); - nolock = is_free_space_inode(root, inode); + nolock = btrfs_is_free_space_inode(root, inode); if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); @@ -1738,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } goto out; @@ -1787,18 +1792,18 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) &ordered_extent->list); ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (!ret) { - ret = btrfs_update_inode(trans, root, inode); + if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + ret = btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } ret = 0; out: - if (nolock) { - if (trans) - btrfs_end_transaction_nolock(trans, root); - } else { + if (root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, ordered_extent->len); - if (trans) + if (trans) { + if (nolock) + btrfs_end_transaction_nolock(trans, root); + else btrfs_end_transaction(trans, root); } @@ -1820,153 +1825,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, } /* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the page is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. - */ -struct io_failure_record { - struct page *page; - u64 start; - u64 len; - u64 logical; - unsigned long bio_flags; - int last_mirror; -}; - -static int btrfs_io_failed_hook(struct bio *failed_bio, - struct page *page, u64 start, u64 end, - struct extent_state *state) -{ - struct io_failure_record *failrec = NULL; - u64 private; - struct extent_map *em; - struct inode *inode = page->mapping->host; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct bio *bio; - int num_copies; - int ret; - int rw; - u64 logical; - - ret = get_state_private(failure_tree, start, &private); - if (ret) { - failrec = kmalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return -ENOMEM; - failrec->start = start; - failrec->len = end - start + 1; - failrec->last_mirror = 0; - failrec->bio_flags = 0; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (em->start > start || em->start + em->len < start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - - if (IS_ERR_OR_NULL(em)) { - kfree(failrec); - return -EIO; - } - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, - em->compress_type); - } - failrec->logical = logical; - free_extent_map(em); - set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | - EXTENT_DIRTY, GFP_NOFS); - set_state_private(failure_tree, start, - (u64)(unsigned long)failrec); - } else { - failrec = (struct io_failure_record *)(unsigned long)private; - } - num_copies = btrfs_num_copies( - &BTRFS_I(inode)->root->fs_info->mapping_tree, - failrec->logical, failrec->len); - failrec->last_mirror++; - if (!state) { - spin_lock(&BTRFS_I(inode)->io_tree.lock); - state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, - failrec->start, - EXTENT_LOCKED); - if (state && state->start != failrec->start) - state = NULL; - spin_unlock(&BTRFS_I(inode)->io_tree.lock); - } - if (!state || failrec->last_mirror > num_copies) { - set_state_private(failure_tree, failrec->start, 0); - clear_extent_bits(failure_tree, failrec->start, - failrec->start + failrec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); - kfree(failrec); - return -EIO; - } - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_private = state; - bio->bi_end_io = failed_bio->bi_end_io; - bio->bi_sector = failrec->logical >> 9; - bio->bi_bdev = failed_bio->bi_bdev; - bio->bi_size = 0; - - bio_add_page(bio, page, failrec->len, start - page_offset(page)); - if (failed_bio->bi_rw & REQ_WRITE) - rw = WRITE; - else - rw = READ; - - ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, - failrec->last_mirror, - failrec->bio_flags, 0); - return ret; -} - -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -static int btrfs_clean_io_failures(struct inode *inode, u64 start) -{ - u64 private; - u64 private_failure; - struct io_failure_record *failure; - int ret; - - private = 0; - if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY, 0)) { - ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, - start, &private_failure); - if (ret == 0) { - failure = (struct io_failure_record *)(unsigned long) - private_failure; - set_state_private(&BTRFS_I(inode)->io_failure_tree, - failure->start, 0); - clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, - failure->start, - failure->start + failure->len - 1, - EXTENT_DIRTY | EXTENT_LOCKED, - GFP_NOFS); - kfree(failure); - } - } - return 0; -} - -/* * when reads are done, we need to check csums to verify the data is correct - * if there's a match, we allow the bio to finish. If not, we go through - * the io_failure_record routines to find good copies + * if there's a match, we allow the bio to finish. If not, the code in + * extent_io.c will try to find good copies for us. */ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) @@ -2012,10 +1873,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, kunmap_atomic(kaddr, KM_USER0); good: - /* if the io failure tree for this inode is non-empty, - * check to see if we've recovered from a failed IO - */ - btrfs_clean_io_failures(inode, start); return 0; zeroit: @@ -2080,89 +1937,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) up_read(&root->fs_info->cleanup_work_sem); } -/* - * calculate extra metadata reservation when snapshotting a subvolume - * contains orphan files. - */ -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve) -{ - struct btrfs_root *root; - struct btrfs_block_rsv *block_rsv; - u64 num_bytes; - int index; - - root = pending->root; - if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) - return; - - block_rsv = root->orphan_block_rsv; - - /* orphan block reservation for the snapshot */ - num_bytes = block_rsv->size; - - /* - * after the snapshot is created, COWing tree blocks may use more - * space than it frees. So we should make sure there is enough - * reserved space. - */ - index = trans->transid & 0x1; - if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { - num_bytes += block_rsv->size - - (block_rsv->reserved + block_rsv->freed[index]); - } - - *bytes_to_reserve += num_bytes; -} - -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending) -{ - struct btrfs_root *root = pending->root; - struct btrfs_root *snap = pending->snap; - struct btrfs_block_rsv *block_rsv; - u64 num_bytes; - int index; - int ret; - - if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) - return; - - /* refill source subvolume's orphan block reservation */ - block_rsv = root->orphan_block_rsv; - index = trans->transid & 0x1; - if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { - num_bytes = block_rsv->size - - (block_rsv->reserved + block_rsv->freed[index]); - ret = btrfs_block_rsv_migrate(&pending->block_rsv, - root->orphan_block_rsv, - num_bytes); - BUG_ON(ret); - } - - /* setup orphan block reservation for the snapshot */ - block_rsv = btrfs_alloc_block_rsv(snap); - BUG_ON(!block_rsv); - - btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - snap->orphan_block_rsv = block_rsv; - - num_bytes = root->orphan_block_rsv->size; - ret = btrfs_block_rsv_migrate(&pending->block_rsv, - block_rsv, num_bytes); - BUG_ON(ret); - -#if 0 - /* insert orphan item for the snapshot */ - WARN_ON(!root->orphan_item_inserted); - ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, - snap->root_key.objectid); - BUG_ON(ret); - snap->orphan_item_inserted = 1; -#endif -} - enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, @@ -2214,7 +1988,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (!root->orphan_block_rsv) { block_rsv = btrfs_alloc_block_rsv(root); - BUG_ON(!block_rsv); + if (!block_rsv) + return -ENOMEM; } spin_lock(&root->orphan_lock); @@ -2247,9 +2022,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) } spin_unlock(&root->orphan_lock); - if (block_rsv) - btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); @@ -2316,6 +2088,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; struct inode *inode; + u64 last_objectid = 0; int ret = 0, nr_unlink = 0, nr_truncate = 0; if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) @@ -2367,41 +2140,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * crossing root thing. we store the inode number in the * offset of the orphan item. */ + + if (found_key.offset == last_objectid) { + printk(KERN_ERR "btrfs: Error removing orphan entry, " + "stopping orphan cleanup\n"); + ret = -EINVAL; + goto out; + } + + last_objectid = found_key.offset; + found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); + ret = PTR_RET(inode); + if (ret && ret != -ESTALE) goto out; - } /* - * add this inode to the orphan list so btrfs_orphan_del does - * the proper thing when we hit it + * Inode is already gone but the orphan item is still there, + * kill the orphan item. */ - spin_lock(&root->orphan_lock); - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->orphan_lock); - - /* - * if this is a bad inode, means we actually succeeded in - * removing the inode, but not the orphan record, which means - * we need to manually delete the orphan since iput will just - * do a destroy_inode - */ - if (is_bad_inode(inode)) { - trans = btrfs_start_transaction(root, 0); + if (ret == -ESTALE) { + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } - btrfs_orphan_del(trans, inode); + ret = btrfs_del_orphan_item(trans, root, + found_key.objectid); + BUG_ON(ret); btrfs_end_transaction(trans, root); - iput(inode); continue; } + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + spin_lock(&root->orphan_lock); + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + spin_unlock(&root->orphan_lock); + /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { if (!S_ISREG(inode->i_mode)) { @@ -2420,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (ret) goto out; } + /* release the path since we're done with it */ + btrfs_release_path(path); + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; if (root->orphan_block_rsv) @@ -2516,7 +2300,9 @@ static void btrfs_read_locked_inode(struct inode *inode) filled = true; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + goto make_bad; + path->leave_spinning = 1; memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); @@ -2531,15 +2317,8 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - if (!leaf->map_token) - map_private_extent_buffer(leaf, (unsigned long)inode_item, - sizeof(struct btrfs_inode_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); - inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); inode->i_uid = btrfs_inode_uid(leaf, inode_item); inode->i_gid = btrfs_inode_gid(leaf, inode_item); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); @@ -2575,11 +2354,6 @@ cache_acl: if (!maybe_acls) cache_no_acl(inode); - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - btrfs_free_path(path); switch (inode->i_mode & S_IFMT) { @@ -2624,13 +2398,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - if (!leaf->map_token) - map_private_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_inode_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - btrfs_set_inode_uid(leaf, item, inode->i_uid); btrfs_set_inode_gid(leaf, item, inode->i_gid); btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); @@ -2659,17 +2426,12 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_inode_rdev(leaf, item, inode->i_rdev); btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); btrfs_set_inode_block_group(leaf, item, 0); - - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } } /* * copy everything in the in-memory inode into the btree. */ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, +static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { struct btrfs_inode_item *inode_item; @@ -2677,21 +2439,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int ret; - /* - * If the inode is a free space inode, we can deadlock during commit - * if we put it into the delayed code. - * - * The data relocation inode should also be directly updated - * without delay - */ - if (!is_free_space_inode(root, inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { - ret = btrfs_delayed_update_inode(trans, root, inode); - if (!ret) - btrfs_set_inode_last_trans(trans, inode); - return ret; - } - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2720,6 +2467,43 @@ failed: } /* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + /* + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. + * + * The data relocation inode should also be directly updated + * without delay + */ + if (!btrfs_is_free_space_inode(root, inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_delayed_update_inode(trans, root, inode); + if (!ret) + btrfs_set_inode_last_trans(trans, inode); + return ret; + } + + return btrfs_update_inode_item(trans, root, inode); +} + +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + ret = btrfs_update_inode(trans, root, inode); + if (ret == -ENOSPC) + return btrfs_update_inode_item(trans, root, inode); + return ret; +} + +/* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It remove a link in a directory with a given name, and * also drops the back refs in the inode to the directory @@ -2857,7 +2641,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, u64 ino = btrfs_ino(inode); u64 dir_ino = btrfs_ino(dir); - trans = btrfs_start_transaction(root, 10); + /* + * 1 for the possible orphan item + * 1 for the dir item + * 1 for the dir index + * 1 for the inode ref + * 1 for the inode ref in the tree log + * 2 for the dir entries in the log + * 1 for the inode + */ + trans = btrfs_start_transaction(root, 8); if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; @@ -2880,7 +2673,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, return ERR_PTR(-ENOMEM); } - trans = btrfs_start_transaction(root, 0); + /* 1 for the orphan item */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_free_path(path); root->fs_info->enospc_unlink = 0; @@ -2985,6 +2779,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, err = 0; out: btrfs_free_path(path); + /* Migrate the orphan reservation over */ + if (!err) + err = btrfs_block_rsv_migrate(trans->block_rsv, + &root->fs_info->global_block_rsv, + trans->bytes_reserved); + if (err) { btrfs_end_transaction(trans, root); root->fs_info->enospc_unlink = 0; @@ -2999,6 +2799,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (trans->block_rsv == &root->fs_info->global_block_rsv) { + btrfs_block_rsv_release(root, trans->block_rsv, + trans->bytes_reserved); + trans->block_rsv = &root->fs_info->trans_block_rsv; BUG_ON(!root->fs_info->enospc_unlink); root->fs_info->enospc_unlink = 0; } @@ -3021,13 +2824,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); - BUG_ON(ret); + if (ret) + goto out; if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, inode); - BUG_ON(ret); + if (ret) + goto out; } +out: nr = trans->blocks_used; __unlink_end_trans(trans, root); btrfs_btree_balance_dirty(root, nr); @@ -3170,6 +2976,11 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = -1; + if (root->ref_cows || root == root->fs_info->tree_root) btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); @@ -3182,10 +2993,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (min_type == 0 && root == BTRFS_I(inode)->root) btrfs_kill_delayed_inode_items(inode); - path = btrfs_alloc_path(); - BUG_ON(!path); - path->reada = -1; - key.objectid = ino; key.offset = (u64)-1; key.type = (u8)-1; @@ -3386,6 +3193,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) pgoff_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); struct page *page; + gfp_t mask = btrfs_alloc_write_mask(mapping); int ret = 0; u64 page_start; u64 page_end; @@ -3398,7 +3206,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) ret = -ENOMEM; again: - page = grab_cache_page(mapping, index); + page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto out; @@ -3528,15 +3336,19 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) err = btrfs_drop_extents(trans, inode, cur_offset, cur_offset + hole_size, &hint_byte, 1); - if (err) + if (err) { + btrfs_end_transaction(trans, root); break; + } err = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), cur_offset, 0, 0, hole_size, 0, hole_size, 0, 0, 0); - if (err) + if (err) { + btrfs_end_transaction(trans, root); break; + } btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); @@ -3627,6 +3439,8 @@ void btrfs_evict_inode(struct inode *inode) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *rsv, *global_rsv; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); unsigned long nr; int ret; @@ -3634,7 +3448,7 @@ void btrfs_evict_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || - is_free_space_inode(root, inode))) + btrfs_is_free_space_inode(root, inode))) goto no_delete; if (is_bad_inode(inode)) { @@ -3654,22 +3468,55 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + rsv = btrfs_alloc_block_rsv(root); + if (!rsv) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + rsv->size = min_size; + global_rsv = &root->fs_info->global_block_rsv; + btrfs_i_size_write(inode, 0); + /* + * This is a bit simpler than btrfs_truncate since + * + * 1) We've already reserved our space for our orphan item in the + * unlink. + * 2) We're going to delete the inode item, so we don't need to update + * it at all. + * + * So we just need to reserve some slack space in case we add bytes when + * doing the truncate. + */ while (1) { - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - trans->block_rsv = root->orphan_block_rsv; + ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); + + /* + * Try and steal from the global reserve since we will + * likely not use this space anyway, we want to try as + * hard as possible to get this to work. + */ + if (ret) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, 0, 5); if (ret) { - BUG_ON(ret != -EAGAIN); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); - continue; + printk(KERN_WARNING "Could not get space for a " + "delete, will truncate on mount %d\n", ret); + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; } + trans->block_rsv = rsv; + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); if (ret != -EAGAIN) break; @@ -3678,14 +3525,17 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans, root); trans = NULL; btrfs_btree_balance_dirty(root, nr); - } + btrfs_free_block_rsv(root, rsv); + if (ret == 0) { + trans->block_rsv = root->orphan_block_rsv; ret = btrfs_orphan_del(trans, inode); BUG_ON(ret); } + trans->block_rsv = &root->fs_info->trans_block_rsv; if (!(root == root->fs_info->tree_root || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) btrfs_return_ino(root, btrfs_ino(inode)); @@ -3713,7 +3563,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, int ret = 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, namelen, 0); @@ -3978,10 +3829,16 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); - inode_tree_add(inode); - unlock_new_inode(inode); - if (new) - *new = 1; + if (!is_bad_inode(inode)) { + inode_tree_add(inode); + unlock_new_inode(inode); + if (new) + *new = 1; + } else { + unlock_new_inode(inode); + iput(inode); + inode = ERR_PTR(-ESTALE); + } } return inode; @@ -4016,12 +3873,20 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *sub_root = root; struct btrfs_key location; int index; - int ret; + int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ret = btrfs_inode_by_name(dir, dentry, &location); + if (unlikely(d_need_lookup(dentry))) { + memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); + kfree(dentry->d_fsdata); + dentry->d_fsdata = NULL; + /* This thing is hashed, drop it for now */ + d_drop(dentry); + } else { + ret = btrfs_inode_by_name(dir, dentry, &location); + } if (ret < 0) return ERR_PTR(ret); @@ -4076,10 +3941,24 @@ static int btrfs_dentry_delete(const struct dentry *dentry) return 0; } +static void btrfs_dentry_release(struct dentry *dentry) +{ + if (dentry->d_fsdata) + kfree(dentry->d_fsdata); +} + static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - return d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); + struct dentry *ret; + + ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); + if (unlikely(d_need_lookup(dentry))) { + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_NEED_LOOKUP; + spin_unlock(&dentry->d_lock); + } + return ret; } unsigned char btrfs_filetype_table[] = { @@ -4098,6 +3977,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, struct btrfs_path *path; struct list_head ins_list; struct list_head del_list; + struct qstr q; int ret; struct extent_buffer *leaf; int slot; @@ -4118,7 +3998,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, /* special case for "." */ if (filp->f_pos == 0) { - over = filldir(dirent, ".", 1, 1, btrfs_ino(inode), DT_DIR); + over = filldir(dirent, ".", 1, + filp->f_pos, btrfs_ino(inode), DT_DIR); if (over) return 0; filp->f_pos = 1; @@ -4127,7 +4008,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, if (filp->f_pos == 1) { u64 pino = parent_ino(filp->f_path.dentry); over = filldir(dirent, "..", 2, - 2, pino, DT_DIR); + filp->f_pos, pino, DT_DIR); if (over) return 0; filp->f_pos = 2; @@ -4187,6 +4068,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, while (di_cur < di_total) { struct btrfs_key location; + struct dentry *tmp; if (verify_dir_item(root, leaf, di)) break; @@ -4207,6 +4089,33 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; btrfs_dir_item_key_to_cpu(leaf, di, &location); + q.name = name_ptr; + q.len = name_len; + q.hash = full_name_hash(q.name, q.len); + tmp = d_lookup(filp->f_dentry, &q); + if (!tmp) { + struct btrfs_key *newkey; + + newkey = kzalloc(sizeof(struct btrfs_key), + GFP_NOFS); + if (!newkey) + goto no_dentry; + tmp = d_alloc(filp->f_dentry, &q); + if (!tmp) { + kfree(newkey); + dput(tmp); + goto no_dentry; + } + memcpy(newkey, &location, + sizeof(struct btrfs_key)); + tmp->d_fsdata = newkey; + tmp->d_flags |= DCACHE_NEED_LOOKUP; + d_rehash(tmp); + dput(tmp); + } else { + dput(tmp); + } +no_dentry: /* is this a reference to our own snapshot? If so * skip it */ @@ -4271,7 +4180,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (BTRFS_I(inode)->dummy_inode) return 0; - if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode)) + if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) nolock = true; if (wbc->sync_mode == WB_SYNC_ALL) { @@ -4432,7 +4341,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, int owner; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return ERR_PTR(-ENOMEM); inode = new_inode(root->fs_info->sb); if (!inode) { @@ -4467,7 +4377,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode->i_generation = BTRFS_I(inode)->generation; btrfs_set_inode_space_info(root, inode); - if (mode & S_IFDIR) + if (S_ISDIR(mode)) owner = 0; else owner = 1; @@ -4512,7 +4422,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_inherit_iflags(inode, dir); - if ((mode & S_IFREG)) { + if (S_ISREG(mode)) { if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(root, NODATACOW) || @@ -5749,8 +5659,7 @@ again: if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret) - ret = btrfs_update_inode(trans, root, inode); - err = ret; + err = btrfs_update_inode_fallback(trans, root, inode); goto out; } @@ -5787,8 +5696,8 @@ again: add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); ret = btrfs_ordered_update_i_size(inode, 0, ordered); - if (!ret) - btrfs_update_inode(trans, root, inode); + if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) + btrfs_update_inode_fallback(trans, root, inode); ret = 0; out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, @@ -6243,7 +6152,7 @@ int btrfs_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btrfs_get_extent); + return extent_read_full_page(tree, page, btrfs_get_extent, 0); } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -6495,6 +6404,7 @@ static int btrfs_truncate(struct inode *inode) struct btrfs_trans_handle *trans; unsigned long nr; u64 mask = root->sectorsize - 1; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (ret) @@ -6542,19 +6452,23 @@ static int btrfs_truncate(struct inode *inode) rsv = btrfs_alloc_block_rsv(root); if (!rsv) return -ENOMEM; - btrfs_add_durable_block_rsv(root->fs_info, rsv); + rsv->size = min_size; + /* + * 1 for the truncate slack space + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion + * 1 for updating the inode. + */ trans = btrfs_start_transaction(root, 4); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; } - /* - * Reserve space for the truncate process. Truncate should be adding - * space, but if there are snapshots it may end up using space. - */ - ret = btrfs_truncate_reserve_metadata(trans, root, rsv); + /* Migrate the slack space for the truncate to our reserve */ + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + min_size); BUG_ON(ret); ret = btrfs_orphan_add(trans, inode); @@ -6563,21 +6477,6 @@ static int btrfs_truncate(struct inode *inode) goto out; } - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - - /* - * Ok so we've already migrated our bytes over for the truncate, so here - * just reserve the one slot we need for updating the inode. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out; - } - trans->block_rsv = rsv; - /* * setattr is responsible for setting the ordered_data_close flag, * but that is only tested during the last file release. That @@ -6599,20 +6498,30 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { + ret = btrfs_block_rsv_refill(root, rsv, min_size); + if (ret) { + /* + * This can only happen with the original transaction we + * started above, every other time we shouldn't have a + * transaction started yet. + */ + if (ret == -EAGAIN) + goto end_trans; + err = ret; + break; + } + if (!trans) { - trans = btrfs_start_transaction(root, 3); + /* Just need the 1 for updating the inode */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; } - - ret = btrfs_truncate_reserve_metadata(trans, root, - rsv); - BUG_ON(ret); - - trans->block_rsv = rsv; } + trans->block_rsv = rsv; + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); @@ -6627,7 +6536,7 @@ static int btrfs_truncate(struct inode *inode) err = ret; break; } - +end_trans: nr = trans->blocks_used; btrfs_end_transaction(trans, root); trans = NULL; @@ -6647,14 +6556,16 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(NULL, inode); } - trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - if (ret && !err) - err = ret; + if (trans) { + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret && !err) + err = ret; - nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); - btrfs_btree_balance_dirty(root, nr); + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + } out: btrfs_free_block_rsv(root, rsv); @@ -6682,7 +6593,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - inode->i_nlink = 1; + set_nlink(inode, 1); btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, new_root, inode); @@ -6692,19 +6603,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, return 0; } -/* helper function for file defrag and space balancing. This - * forces readahead on a given range of bytes in an inode - */ -unsigned long btrfs_force_ra(struct address_space *mapping, - struct file_ra_state *ra, struct file *file, - pgoff_t offset, pgoff_t last_index) -{ - pgoff_t req_size = last_index - offset + 1; - - page_cache_sync_readahead(mapping, ra, file, offset, req_size); - return offset + req_size; -} - struct inode *btrfs_alloc_inode(struct super_block *sb) { struct btrfs_inode *ei; @@ -6722,19 +6620,21 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; - ei->reserved_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; + ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->last_unlink_trans = 0; - atomic_set(&ei->outstanding_extents, 0); - atomic_set(&ei->reserved_extents, 0); + spin_lock_init(&ei->lock); + ei->outstanding_extents = 0; + ei->reserved_extents = 0; ei->ordered_data_close = 0; ei->orphan_meta_reserved = 0; ei->dummy_inode = 0; ei->in_defrag = 0; + ei->delalloc_meta_reserved = 0; ei->force_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; @@ -6767,8 +6667,10 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); - WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); - WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); + WARN_ON(BTRFS_I(inode)->outstanding_extents); + WARN_ON(BTRFS_I(inode)->reserved_extents); + WARN_ON(BTRFS_I(inode)->delalloc_bytes); + WARN_ON(BTRFS_I(inode)->csum_bytes); /* * This can happen where we create an inode, but somebody else also @@ -6823,7 +6725,7 @@ int btrfs_drop_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; if (btrfs_root_refs(&root->root_item) == 0 && - !is_free_space_inode(root, inode)) + !btrfs_is_free_space_inode(root, inode)) return 1; else return generic_drop_inode(inode); @@ -6892,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; + u32 blocksize = inode->i_sb->s_blocksize; + generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; - stat->blocks = (inode_get_bytes(inode) + - BTRFS_I(inode)->delalloc_bytes) >> 9; + stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + + ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; return 0; } @@ -7186,7 +7090,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + err = -ENOMEM; + drop_inode = 1; + goto out_unlock; + } key.objectid = btrfs_ino(inode); key.offset = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); @@ -7326,11 +7234,15 @@ static int btrfs_set_page_dirty(struct page *page) static int btrfs_permission(struct inode *inode, int mask) { struct btrfs_root *root = BTRFS_I(inode)->root; + umode_t mode = inode->i_mode; - if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) - return -EROFS; - if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) - return -EACCES; + if (mask & MAY_WRITE && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + if (btrfs_root_readonly(root)) + return -EROFS; + if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) + return -EACCES; + } return generic_permission(inode, mask); } @@ -7351,12 +7263,12 @@ static const struct inode_operations btrfs_dir_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; static const struct file_operations btrfs_dir_file_operations = { @@ -7378,7 +7290,6 @@ static struct extent_io_ops btrfs_extent_io_ops = { .readpage_end_io_hook = btrfs_readpage_end_io_hook, .writepage_end_io_hook = btrfs_writepage_end_io_hook, .writepage_start_hook = btrfs_writepage_start_hook, - .readpage_io_failed_hook = btrfs_io_failed_hook, .set_bit_hook = btrfs_set_bit_hook, .clear_bit_hook = btrfs_clear_bit_hook, .merge_extent_hook = btrfs_merge_extent_hook, @@ -7425,7 +7336,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, @@ -7435,7 +7346,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, @@ -7447,9 +7358,10 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; const struct dentry_operations btrfs_dentry_operations = { .d_delete = btrfs_dentry_delete, + .d_release = btrfs_dentry_release, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 622543309eb2..72d461656f60 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -51,6 +51,7 @@ #include "volumes.h" #include "locking.h" #include "inode-map.h" +#include "backref.h" /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode) /* * Inherit flags from the parent inode. * - * Unlike extN we don't have any flags we don't want to inherit currently. + * Currently only the compression flags and the cow flags are inherited. */ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) { @@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) flags = BTRFS_I(dir)->flags; - if (S_ISREG(inode->i_mode)) - flags &= ~BTRFS_INODE_DIRSYNC; - else if (!S_ISDIR(inode->i_mode)) - flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); + if (flags & BTRFS_INODE_NOCOMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & BTRFS_INODE_COMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + } + + if (flags & BTRFS_INODE_NODATACOW) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - BTRFS_I(inode)->flags = flags; btrfs_update_iflags(inode); } @@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; + u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) } } rcu_read_unlock(); + if (!num_devices) return -EOPNOTSUPP; - if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; + if (range.start > total_bytes) + return -EINVAL; + range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(root, &range); if (ret < 0) @@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, int ret = 1; /* - * make sure that once we start defragging and extent, we keep on + * make sure that once we start defragging an extent, we keep on * defragging it */ if (start < *defrag_end) @@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, * extent will force at least part of that big extent to be defragged. */ if (ret) { - *last_len += len; *defrag_end = extent_map_end(em); } else { *last_len = 0; @@ -843,6 +852,7 @@ static int cluster_pages_for_defrag(struct inode *inode, int i_done; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); if (isize == 0) return 0; @@ -859,8 +869,8 @@ again: /* step one, lock all the pages */ for (i = 0; i < num_pages; i++) { struct page *page; - page = grab_cache_page(inode->i_mapping, - start_index + i); + page = find_or_create_page(inode->i_mapping, + start_index + i, mask); if (!page) break; @@ -930,7 +940,9 @@ again: GFP_NOFS); if (i_done != num_pages) { - atomic_inc(&BTRFS_I(inode)->outstanding_extents); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, (num_pages - i_done) << PAGE_CACHE_SHIFT); } @@ -970,18 +982,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_super_block *disk_super; struct file_ra_state *ra = NULL; unsigned long last_index; + u64 isize = i_size_read(inode); u64 features; u64 last_len = 0; u64 skip = 0; u64 defrag_end = 0; u64 newer_off = range->start; - int newer_left = 0; unsigned long i; + unsigned long ra_index = 0; int ret; int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; int extent_thresh = range->extent_thresh; - int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + int cluster = max_cluster; u64 new_align = ~((u64)128 * 1024 - 1); struct page **pages = NULL; @@ -995,7 +1009,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, compress_type = range->compress_type; } - if (inode->i_size == 0) + if (isize == 0) return 0; /* @@ -1011,7 +1025,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra = &file->f_ra; } - pages = kmalloc(sizeof(struct page *) * newer_cluster, + pages = kmalloc(sizeof(struct page *) * max_cluster, GFP_NOFS); if (!pages) { ret = -ENOMEM; @@ -1020,10 +1034,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, /* find the last page to defrag */ if (range->start + range->len > range->start) { - last_index = min_t(u64, inode->i_size - 1, + last_index = min_t(u64, isize - 1, range->start + range->len - 1) >> PAGE_CACHE_SHIFT; } else { - last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + last_index = (isize - 1) >> PAGE_CACHE_SHIFT; } if (newer_than) { @@ -1036,16 +1050,24 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * the extents in the file evenly spaced */ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - newer_left = newer_cluster; } else goto out_ra; } else { i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index - 1; + max_to_defrag = last_index; - while (i <= last_index && defrag_count < max_to_defrag) { + /* + * make writeback starts from i, so the defrag range can be + * written sequentially. + */ + if (i < inode->i_mapping->writeback_index) + inode->i_mapping->writeback_index = i; + + while (i <= last_index && defrag_count < max_to_defrag && + (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) { /* * make sure we stop running if someone unmounts * the FS @@ -1068,18 +1090,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = max(i + 1, next); continue; } + + if (!newer_than) { + cluster = (PAGE_CACHE_ALIGN(defrag_end) >> + PAGE_CACHE_SHIFT) - i; + cluster = min(cluster, max_cluster); + } else { + cluster = max_cluster; + } + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; - btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); + if (i + cluster > ra_index) { + ra_index = max(i, ra_index); + btrfs_force_ra(inode->i_mapping, ra, file, ra_index, + cluster); + ra_index += max_cluster; + } - ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); + ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) goto out_ra; defrag_count += ret; balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); - i += ret; if (newer_than) { if (newer_off == (u64)-1) @@ -1094,12 +1129,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!ret) { range->start = newer_off; i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - newer_left = newer_cluster; } else { break; } } else { - i++; + if (ret > 0) { + i += ret; + last_len += ret << PAGE_CACHE_SHIFT; + } else { + i++; + last_len = 0; + } } } @@ -1125,16 +1165,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, mutex_unlock(&inode->i_mutex); } - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (range->compress_type == BTRFS_COMPRESS_LZO) { features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; btrfs_set_super_incompat_flags(disk_super, features); } - if (!file) - kfree(ra); - return defrag_count; + ret = defrag_count; out_ra: if (!file) @@ -1178,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", + printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", + printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_unlock; @@ -1229,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk(KERN_INFO "new size for %s is %llu\n", + printk(KERN_INFO "btrfs: new size for %s is %llu\n", device->name, (unsigned long long)new_size); if (new_size > old_size) { @@ -1240,7 +1278,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); - } else { + } else if (new_size < old_size) { ret = btrfs_shrink_device(device, new_size); } @@ -1747,11 +1785,10 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.objectid = key.offset; key.offset = (u64)-1; dirid = key.objectid; - } if (ptr < name) goto out; - memcpy(name, ptr, total_len); + memmove(name, ptr, total_len); name[total_len]='\0'; ret = 0; out: @@ -2176,6 +2213,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (!(src_file->f_mode & FMODE_READ)) goto out_fput; + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + goto out_fput; + ret = -EISDIR; if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) goto out_fput; @@ -2219,6 +2261,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, !IS_ALIGNED(destoff, bs)) goto out_unlock; + if (destoff > inode->i_size) { + ret = btrfs_cont_expand(inode, inode->i_size, destoff); + if (ret) + goto out_unlock; + } + + /* truncate page cache pages from target inode range */ + truncate_inode_pages_range(&inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len) - 1); + /* do any pending delalloc/csum calc on src, one way or another, and lock file content */ while (1) { @@ -2312,7 +2364,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, else new_key.offset = destoff; - trans = btrfs_start_transaction(root, 1); + /* + * 1 - adjusting old extent (we may have to split it) + * 1 - add new extent + * 1 - inode update + */ + trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; @@ -2320,14 +2377,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { + /* + * a | --- range to clone ---| b + * | ------------- extent ------------- | + */ + + /* substract range b */ + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + /* substract range a */ if (off > key.offset) { datao += off - key.offset; datal -= off - key.offset; } - if (key.offset + datal > off + len) - datal = off + len - key.offset; - ret = btrfs_drop_extents(trans, inode, new_key.offset, new_key.offset + datal, @@ -2424,7 +2488,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (endoff > inode->i_size) btrfs_i_size_write(inode, endoff); - BTRFS_I(inode)->flags = BTRFS_I(src)->flags; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); btrfs_end_transaction(trans, root); @@ -2551,7 +2614,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) return PTR_ERR(trans); } - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, dir_id, "default", 7, 1); if (IS_ERR_OR_NULL(di)) { @@ -2567,7 +2630,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; @@ -2828,6 +2891,147 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, return ret; } +static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) +{ + int ret = 0; + int i; + u64 rel_ptr; + int size; + struct btrfs_ioctl_ino_path_args *ipa = NULL; + struct inode_fs_paths *ipath = NULL; + struct btrfs_path *path; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ipa = memdup_user(arg, sizeof(*ipa)); + if (IS_ERR(ipa)) { + ret = PTR_ERR(ipa); + ipa = NULL; + goto out; + } + + size = min_t(u32, ipa->size, 4096); + ipath = init_ipath(size, root, path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto out; + } + + ret = paths_from_inode(ipa->inum, ipath); + if (ret < 0) + goto out; + + for (i = 0; i < ipath->fspath->elem_cnt; ++i) { + rel_ptr = ipath->fspath->val[i] - + (u64)(unsigned long)ipath->fspath->val; + ipath->fspath->val[i] = rel_ptr; + } + + ret = copy_to_user((void *)(unsigned long)ipa->fspath, + (void *)(unsigned long)ipath->fspath, size); + if (ret) { + ret = -EFAULT; + goto out; + } + +out: + btrfs_free_path(path); + free_ipath(ipath); + kfree(ipa); + + return ret; +} + +static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct btrfs_data_container *inodes = ctx; + const size_t c = 3 * sizeof(u64); + + if (inodes->bytes_left >= c) { + inodes->bytes_left -= c; + inodes->val[inodes->elem_cnt] = inum; + inodes->val[inodes->elem_cnt + 1] = offset; + inodes->val[inodes->elem_cnt + 2] = root; + inodes->elem_cnt += 3; + } else { + inodes->bytes_missing += c - inodes->bytes_left; + inodes->bytes_left = 0; + inodes->elem_missed += 3; + } + + return 0; +} + +static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, + void __user *arg) +{ + int ret = 0; + int size; + u64 extent_offset; + struct btrfs_ioctl_logical_ino_args *loi; + struct btrfs_data_container *inodes = NULL; + struct btrfs_path *path = NULL; + struct btrfs_key key; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + loi = memdup_user(arg, sizeof(*loi)); + if (IS_ERR(loi)) { + ret = PTR_ERR(loi); + loi = NULL; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + size = min_t(u32, loi->size, 4096); + inodes = init_data_container(size); + if (IS_ERR(inodes)) { + ret = PTR_ERR(inodes); + inodes = NULL; + goto out; + } + + ret = extent_from_logical(root->fs_info, loi->logical, path, &key); + + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = -ENOENT; + if (ret < 0) + goto out; + + extent_offset = loi->logical - key.objectid; + ret = iterate_extent_inodes(root->fs_info, path, key.objectid, + extent_offset, build_ino_list, inodes); + + if (ret < 0) + goto out; + + ret = copy_to_user((void *)(unsigned long)loi->inodes, + (void *)(unsigned long)inodes, size); + if (ret) + ret = -EFAULT; + +out: + btrfs_free_path(path); + kfree(inodes); + kfree(loi); + + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2885,6 +3089,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_tree_search(file, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(file, argp); + case BTRFS_IOC_INO_PATHS: + return btrfs_ioctl_ino_to_path(root, argp); + case BTRFS_IOC_LOGICAL_INO: + return btrfs_ioctl_logical_to_ino(root, argp); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(root, argp); case BTRFS_IOC_SYNC: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ad1ea789fcb4..252ae9915de8 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_space_info spaces[0]; }; +struct btrfs_data_container { + __u32 bytes_left; /* out -- bytes not needed to deliver output */ + __u32 bytes_missing; /* out -- additional bytes needed for result */ + __u32 elem_cnt; /* out */ + __u32 elem_missed; /* out */ + __u64 val[0]; /* out */ +}; + +struct btrfs_ioctl_ino_path_args { + __u64 inum; /* in */ + __u32 size; /* in */ + __u64 reserved[4]; + /* struct btrfs_data_container *fspath; out */ + __u64 fspath; /* out */ +}; + +struct btrfs_ioctl_logical_ino_args { + __u64 logical; /* in */ + __u32 size; /* in */ + __u64 reserved[4]; + /* struct btrfs_data_container *inodes; out */ + __u64 inodes; +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_dev_info_args) #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ struct btrfs_ioctl_fs_info_args) +#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ + struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ + struct btrfs_ioctl_ino_path_args) + #endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 66fa43dc3f0f..d77b67c4b275 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -24,185 +24,197 @@ #include "extent_io.h" #include "locking.h" -static inline void spin_nested(struct extent_buffer *eb) -{ - spin_lock(&eb->lock); -} +void btrfs_assert_tree_read_locked(struct extent_buffer *eb); /* - * Setting a lock to blocking will drop the spinlock and set the - * flag that forces other procs who want the lock to wait. After - * this you can safely schedule with the lock held. + * if we currently have a spinning reader or writer lock + * (indicated by the rw flag) this will bump the count + * of blocking holders and drop the spinlock. */ -void btrfs_set_lock_blocking(struct extent_buffer *eb) +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { - set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); - spin_unlock(&eb->lock); + if (rw == BTRFS_WRITE_LOCK) { + if (atomic_read(&eb->blocking_writers) == 0) { + WARN_ON(atomic_read(&eb->spinning_writers) != 1); + atomic_dec(&eb->spinning_writers); + btrfs_assert_tree_locked(eb); + atomic_inc(&eb->blocking_writers); + write_unlock(&eb->lock); + } + } else if (rw == BTRFS_READ_LOCK) { + btrfs_assert_tree_read_locked(eb); + atomic_inc(&eb->blocking_readers); + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); + read_unlock(&eb->lock); } - /* exit with the spin lock released and the bit set */ + return; } /* - * clearing the blocking flag will take the spinlock again. - * After this you can't safely schedule + * if we currently have a blocking lock, take the spinlock + * and drop our blocking count */ -void btrfs_clear_lock_blocking(struct extent_buffer *eb) +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { - spin_nested(eb); - clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); - smp_mb__after_clear_bit(); + if (rw == BTRFS_WRITE_LOCK_BLOCKING) { + BUG_ON(atomic_read(&eb->blocking_writers) != 1); + write_lock(&eb->lock); + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_inc(&eb->spinning_writers); + if (atomic_dec_and_test(&eb->blocking_writers)) + wake_up(&eb->write_lock_wq); + } else if (rw == BTRFS_READ_LOCK_BLOCKING) { + BUG_ON(atomic_read(&eb->blocking_readers) == 0); + read_lock(&eb->lock); + atomic_inc(&eb->spinning_readers); + if (atomic_dec_and_test(&eb->blocking_readers)) + wake_up(&eb->read_lock_wq); } - /* exit with the spin lock held */ + return; } /* - * unfortunately, many of the places that currently set a lock to blocking - * don't end up blocking for very long, and often they don't block - * at all. For a dbench 50 run, if we don't spin on the blocking bit - * at all, the context switch rate can jump up to 400,000/sec or more. - * - * So, we're still stuck with this crummy spin on the blocking bit, - * at least until the most common causes of the short blocks - * can be dealt with. + * take a spinning read lock. This will wait for any blocking + * writers */ -static int btrfs_spin_on_block(struct extent_buffer *eb) +void btrfs_tree_read_lock(struct extent_buffer *eb) { - int i; - - for (i = 0; i < 512; i++) { - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - if (need_resched()) - break; - cpu_relax(); +again: + wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers)) { + read_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); + goto again; } - return 0; + atomic_inc(&eb->read_locks); + atomic_inc(&eb->spinning_readers); } /* - * This is somewhat different from trylock. It will take the - * spinlock but if it finds the lock is set to blocking, it will - * return without the lock held. - * - * returns 1 if it was able to take the lock and zero otherwise - * - * After this call, scheduling is not safe without first calling - * btrfs_set_lock_blocking() + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers */ -int btrfs_try_spin_lock(struct extent_buffer *eb) +int btrfs_try_tree_read_lock(struct extent_buffer *eb) { - int i; + if (atomic_read(&eb->blocking_writers)) + return 0; - if (btrfs_spin_on_block(eb)) { - spin_nested(eb); - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - spin_unlock(&eb->lock); + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers)) { + read_unlock(&eb->lock); + return 0; } - /* spin for a bit on the BLOCKING flag */ - for (i = 0; i < 2; i++) { - cpu_relax(); - if (!btrfs_spin_on_block(eb)) - break; - - spin_nested(eb); - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - spin_unlock(&eb->lock); - } - return 0; + atomic_inc(&eb->read_locks); + atomic_inc(&eb->spinning_readers); + return 1; } /* - * the autoremove wake function will return 0 if it tried to wake up - * a process that was already awake, which means that process won't - * count as an exclusive wakeup. The waitq code will continue waking - * procs until it finds one that was actually sleeping. - * - * For btrfs, this isn't quite what we want. We want a single proc - * to be notified that the lock is ready for taking. If that proc - * already happen to be awake, great, it will loop around and try for - * the lock. - * - * So, btrfs_wake_function always returns 1, even when the proc that we - * tried to wake up was already awake. + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers or readers */ -static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, - int sync, void *key) +int btrfs_try_tree_write_lock(struct extent_buffer *eb) { - autoremove_wake_function(wait, mode, sync, key); + if (atomic_read(&eb->blocking_writers) || + atomic_read(&eb->blocking_readers)) + return 0; + write_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers) || + atomic_read(&eb->blocking_readers)) { + write_unlock(&eb->lock); + return 0; + } + atomic_inc(&eb->write_locks); + atomic_inc(&eb->spinning_writers); return 1; } /* - * returns with the extent buffer spinlocked. - * - * This will spin and/or wait as required to take the lock, and then - * return with the spinlock held. - * - * After this call, scheduling is not safe without first calling - * btrfs_set_lock_blocking() + * drop a spinning read lock + */ +void btrfs_tree_read_unlock(struct extent_buffer *eb) +{ + btrfs_assert_tree_read_locked(eb); + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); + atomic_dec(&eb->read_locks); + read_unlock(&eb->lock); +} + +/* + * drop a blocking read lock + */ +void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) +{ + btrfs_assert_tree_read_locked(eb); + WARN_ON(atomic_read(&eb->blocking_readers) == 0); + if (atomic_dec_and_test(&eb->blocking_readers)) + wake_up(&eb->read_lock_wq); + atomic_dec(&eb->read_locks); +} + +/* + * take a spinning write lock. This will wait for both + * blocking readers or writers */ int btrfs_tree_lock(struct extent_buffer *eb) { - DEFINE_WAIT(wait); - wait.func = btrfs_wake_function; - - if (!btrfs_spin_on_block(eb)) - goto sleep; - - while(1) { - spin_nested(eb); - - /* nobody is blocking, exit with the spinlock held */ - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 0; - - /* - * we have the spinlock, but the real owner is blocking. - * wait for them - */ - spin_unlock(&eb->lock); - - /* - * spin for a bit, and if the blocking flag goes away, - * loop around - */ - cpu_relax(); - if (btrfs_spin_on_block(eb)) - continue; -sleep: - prepare_to_wait_exclusive(&eb->lock_wq, &wait, - TASK_UNINTERRUPTIBLE); - - if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - schedule(); - - finish_wait(&eb->lock_wq, &wait); +again: + wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); + wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); + write_lock(&eb->lock); + if (atomic_read(&eb->blocking_readers)) { + write_unlock(&eb->lock); + wait_event(eb->read_lock_wq, + atomic_read(&eb->blocking_readers) == 0); + goto again; } + if (atomic_read(&eb->blocking_writers)) { + write_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); + goto again; + } + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_inc(&eb->spinning_writers); + atomic_inc(&eb->write_locks); return 0; } +/* + * drop a spinning or a blocking write lock. + */ int btrfs_tree_unlock(struct extent_buffer *eb) { - /* - * if we were a blocking owner, we don't have the spinlock held - * just clear the bit and look for waiters - */ - if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - smp_mb__after_clear_bit(); - else - spin_unlock(&eb->lock); - - if (waitqueue_active(&eb->lock_wq)) - wake_up(&eb->lock_wq); + int blockers = atomic_read(&eb->blocking_writers); + + BUG_ON(blockers > 1); + + btrfs_assert_tree_locked(eb); + atomic_dec(&eb->write_locks); + + if (blockers) { + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_dec(&eb->blocking_writers); + smp_wmb(); + wake_up(&eb->write_lock_wq); + } else { + WARN_ON(atomic_read(&eb->spinning_writers) != 1); + atomic_dec(&eb->spinning_writers); + write_unlock(&eb->lock); + } return 0; } void btrfs_assert_tree_locked(struct extent_buffer *eb) { - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - assert_spin_locked(&eb->lock); + BUG_ON(!atomic_read(&eb->write_locks)); +} + +void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ + BUG_ON(!atomic_read(&eb->read_locks)); } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 5c33a560a2f1..17247ddb81a0 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -19,11 +19,43 @@ #ifndef __BTRFS_LOCKING_ #define __BTRFS_LOCKING_ +#define BTRFS_WRITE_LOCK 1 +#define BTRFS_READ_LOCK 2 +#define BTRFS_WRITE_LOCK_BLOCKING 3 +#define BTRFS_READ_LOCK_BLOCKING 4 + int btrfs_tree_lock(struct extent_buffer *eb); int btrfs_tree_unlock(struct extent_buffer *eb); int btrfs_try_spin_lock(struct extent_buffer *eb); -void btrfs_set_lock_blocking(struct extent_buffer *eb); -void btrfs_clear_lock_blocking(struct extent_buffer *eb); +void btrfs_tree_read_lock(struct extent_buffer *eb); +void btrfs_tree_read_unlock(struct extent_buffer *eb); +void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw); +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); void btrfs_assert_tree_locked(struct extent_buffer *eb); +int btrfs_try_tree_read_lock(struct extent_buffer *eb); +int btrfs_try_tree_write_lock(struct extent_buffer *eb); + +static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) +{ + if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING) + btrfs_tree_unlock(eb); + else if (rw == BTRFS_READ_LOCK_BLOCKING) + btrfs_tree_read_unlock_blocking(eb); + else if (rw == BTRFS_READ_LOCK) + btrfs_tree_read_unlock(eb); + else + BUG(); +} + +static inline void btrfs_set_lock_blocking(struct extent_buffer *eb) +{ + btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK); +} + +static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb) +{ + btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING); +} #endif diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index fb2605d998e9..f38e452486b8 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot) void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; - u32 type; - u32 nr = btrfs_header_nritems(l); + u32 type, nr; struct btrfs_item *item; struct btrfs_root_item *ri; struct btrfs_dir_item *di; @@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) struct btrfs_key key; struct btrfs_key found_key; + if (!l) + return; + + nr = btrfs_header_nritems(l); + printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", (unsigned long long)btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c new file mode 100644 index 000000000000..2373b39a132b --- /dev/null +++ b/fs/btrfs/reada.c @@ -0,0 +1,951 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/rbtree.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include "ctree.h" +#include "volumes.h" +#include "disk-io.h" +#include "transaction.h" + +#undef DEBUG + +/* + * This is the implementation for the generic read ahead framework. + * + * To trigger a readahead, btrfs_reada_add must be called. It will start + * a read ahead for the given range [start, end) on tree root. The returned + * handle can either be used to wait on the readahead to finish + * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). + * + * The read ahead works as follows: + * On btrfs_reada_add, the root of the tree is inserted into a radix_tree. + * reada_start_machine will then search for extents to prefetch and trigger + * some reads. When a read finishes for a node, all contained node/leaf + * pointers that lie in the given range will also be enqueued. The reads will + * be triggered in sequential order, thus giving a big win over a naive + * enumeration. It will also make use of multi-device layouts. Each disk + * will have its on read pointer and all disks will by utilized in parallel. + * Also will no two disks read both sides of a mirror simultaneously, as this + * would waste seeking capacity. Instead both disks will read different parts + * of the filesystem. + * Any number of readaheads can be started in parallel. The read order will be + * determined globally, i.e. 2 parallel readaheads will normally finish faster + * than the 2 started one after another. + */ + +#define MAX_MIRRORS 2 +#define MAX_IN_FLIGHT 6 + +struct reada_extctl { + struct list_head list; + struct reada_control *rc; + u64 generation; +}; + +struct reada_extent { + u64 logical; + struct btrfs_key top; + u32 blocksize; + int err; + struct list_head extctl; + struct kref refcnt; + spinlock_t lock; + struct reada_zone *zones[MAX_MIRRORS]; + int nzones; + struct btrfs_device *scheduled_for; +}; + +struct reada_zone { + u64 start; + u64 end; + u64 elems; + struct list_head list; + spinlock_t lock; + int locked; + struct btrfs_device *device; + struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */ + int ndevs; + struct kref refcnt; +}; + +struct reada_machine_work { + struct btrfs_work work; + struct btrfs_fs_info *fs_info; +}; + +static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *); +static void reada_control_release(struct kref *kref); +static void reada_zone_release(struct kref *kref); +static void reada_start_machine(struct btrfs_fs_info *fs_info); +static void __reada_start_machine(struct btrfs_fs_info *fs_info); + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation); + +/* recurses */ +/* in case of err, eb might be NULL */ +static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int level = 0; + int nritems; + int i; + u64 bytenr; + u64 generation; + struct reada_extent *re; + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head list; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct btrfs_device *for_dev; + + if (eb) + level = btrfs_header_level(eb); + + /* find extent */ + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + kref_get(&re->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (!re) + return -1; + + spin_lock(&re->lock); + /* + * just take the full list from the extent. afterwards we + * don't need the lock anymore + */ + list_replace_init(&re->extctl, &list); + for_dev = re->scheduled_for; + re->scheduled_for = NULL; + spin_unlock(&re->lock); + + if (err == 0) { + nritems = level ? btrfs_header_nritems(eb) : 0; + generation = btrfs_header_generation(eb); + /* + * FIXME: currently we just set nritems to 0 if this is a leaf, + * effectively ignoring the content. In a next step we could + * trigger more readahead depending from the content, e.g. + * fetch the checksums for the extents in the leaf. + */ + } else { + /* + * this is the error case, the extent buffer has not been + * read correctly. We won't access anything from it and + * just cleanup our data structures. Effectively this will + * cut the branch below this node from read ahead. + */ + nritems = 0; + generation = 0; + } + + for (i = 0; i < nritems; i++) { + struct reada_extctl *rec; + u64 n_gen; + struct btrfs_key key; + struct btrfs_key next_key; + + btrfs_node_key_to_cpu(eb, &key, i); + if (i + 1 < nritems) + btrfs_node_key_to_cpu(eb, &next_key, i + 1); + else + next_key = re->top; + bytenr = btrfs_node_blockptr(eb, i); + n_gen = btrfs_node_ptr_generation(eb, i); + + list_for_each_entry(rec, &list, list) { + struct reada_control *rc = rec->rc; + + /* + * if the generation doesn't match, just ignore this + * extctl. This will probably cut off a branch from + * prefetch. Alternatively one could start a new (sub-) + * prefetch for this branch, starting again from root. + * FIXME: move the generation check out of this loop + */ +#ifdef DEBUG + if (rec->generation != generation) { + printk(KERN_DEBUG "generation mismatch for " + "(%llu,%d,%llu) %llu != %llu\n", + key.objectid, key.type, key.offset, + rec->generation, generation); + } +#endif + if (rec->generation == generation && + btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && + btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) + reada_add_block(rc, bytenr, &next_key, + level - 1, n_gen); + } + } + /* + * free extctl records + */ + while (!list_empty(&list)) { + struct reada_control *rc; + struct reada_extctl *rec; + + rec = list_first_entry(&list, struct reada_extctl, list); + list_del(&rec->list); + rc = rec->rc; + kfree(rec); + + kref_get(&rc->refcnt); + if (atomic_dec_and_test(&rc->elems)) { + kref_put(&rc->refcnt, reada_control_release); + wake_up(&rc->wait); + } + kref_put(&rc->refcnt, reada_control_release); + + reada_extent_put(fs_info, re); /* one ref for each entry */ + } + reada_extent_put(fs_info, re); /* our ref */ + if (for_dev) + atomic_dec(&for_dev->reada_in_flight); + + return 0; +} + +/* + * start is passed separately in case eb in NULL, which may be the case with + * failed I/O + */ +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int ret; + + ret = __readahead_hook(root, eb, start, err); + + reada_start_machine(root->fs_info); + + return ret; +} + +static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, u64 logical, + struct btrfs_bio *bbio) +{ + int ret; + int looped = 0; + struct reada_zone *zone; + struct btrfs_block_group_cache *cache = NULL; + u64 start; + u64 end; + int i; + +again: + zone = NULL; + spin_lock(&fs_info->reada_lock); + ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, + logical >> PAGE_CACHE_SHIFT, 1); + if (ret == 1) + kref_get(&zone->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (ret == 1) { + if (logical >= zone->start && logical < zone->end) + return zone; + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + + if (looped) + return NULL; + + cache = btrfs_lookup_block_group(fs_info, logical); + if (!cache) + return NULL; + + start = cache->key.objectid; + end = start + cache->key.offset - 1; + btrfs_put_block_group(cache); + + zone = kzalloc(sizeof(*zone), GFP_NOFS); + if (!zone) + return NULL; + + zone->start = start; + zone->end = end; + INIT_LIST_HEAD(&zone->list); + spin_lock_init(&zone->lock); + zone->locked = 0; + kref_init(&zone->refcnt); + zone->elems = 0; + zone->device = dev; /* our device always sits at index 0 */ + for (i = 0; i < bbio->num_stripes; ++i) { + /* bounds have already been checked */ + zone->devs[i] = bbio->stripes[i].dev; + } + zone->ndevs = bbio->num_stripes; + + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&dev->reada_zones, + (unsigned long)zone->end >> PAGE_CACHE_SHIFT, + zone); + spin_unlock(&fs_info->reada_lock); + + if (ret) { + kfree(zone); + looped = 1; + goto again; + } + + return zone; +} + +static struct reada_extent *reada_find_extent(struct btrfs_root *root, + u64 logical, + struct btrfs_key *top, int level) +{ + int ret; + int looped = 0; + struct reada_extent *re = NULL; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct btrfs_bio *bbio = NULL; + struct btrfs_device *dev; + u32 blocksize; + u64 length; + int nzones = 0; + int i; + unsigned long index = logical >> PAGE_CACHE_SHIFT; + +again: + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + kref_get(&re->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (re || looped) + return re; + + re = kzalloc(sizeof(*re), GFP_NOFS); + if (!re) + return NULL; + + blocksize = btrfs_level_size(root, level); + re->logical = logical; + re->blocksize = blocksize; + re->top = *top; + INIT_LIST_HEAD(&re->extctl); + spin_lock_init(&re->lock); + kref_init(&re->refcnt); + + /* + * map block + */ + length = blocksize; + ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); + if (ret || !bbio || length < blocksize) + goto error; + + if (bbio->num_stripes > MAX_MIRRORS) { + printk(KERN_ERR "btrfs readahead: more than %d copies not " + "supported", MAX_MIRRORS); + goto error; + } + + for (nzones = 0; nzones < bbio->num_stripes; ++nzones) { + struct reada_zone *zone; + + dev = bbio->stripes[nzones].dev; + zone = reada_find_zone(fs_info, dev, logical, bbio); + if (!zone) + break; + + re->zones[nzones] = zone; + spin_lock(&zone->lock); + if (!zone->elems) + kref_get(&zone->refcnt); + ++zone->elems; + spin_unlock(&zone->lock); + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + re->nzones = nzones; + if (nzones == 0) { + /* not a single zone found, error and out */ + goto error; + } + + /* insert extent in reada_tree + all per-device trees, all or nothing */ + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&fs_info->reada_tree, index, re); + if (ret) { + spin_unlock(&fs_info->reada_lock); + if (ret != -ENOMEM) { + /* someone inserted the extent in the meantime */ + looped = 1; + } + goto error; + } + for (i = 0; i < nzones; ++i) { + dev = bbio->stripes[i].dev; + ret = radix_tree_insert(&dev->reada_extents, index, re); + if (ret) { + while (--i >= 0) { + dev = bbio->stripes[i].dev; + BUG_ON(dev == NULL); + radix_tree_delete(&dev->reada_extents, index); + } + BUG_ON(fs_info == NULL); + radix_tree_delete(&fs_info->reada_tree, index); + spin_unlock(&fs_info->reada_lock); + goto error; + } + } + spin_unlock(&fs_info->reada_lock); + + kfree(bbio); + return re; + +error: + while (nzones) { + struct reada_zone *zone; + + --nzones; + zone = re->zones[nzones]; + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* + * no fs_info->reada_lock needed, as this can't be + * the last ref + */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + kfree(bbio); + kfree(re); + if (looped) + goto again; + return NULL; +} + +static void reada_kref_dummy(struct kref *kr) +{ +} + +static void reada_extent_put(struct btrfs_fs_info *fs_info, + struct reada_extent *re) +{ + int i; + unsigned long index = re->logical >> PAGE_CACHE_SHIFT; + + spin_lock(&fs_info->reada_lock); + if (!kref_put(&re->refcnt, reada_kref_dummy)) { + spin_unlock(&fs_info->reada_lock); + return; + } + + radix_tree_delete(&fs_info->reada_tree, index); + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + radix_tree_delete(&zone->device->reada_extents, index); + } + + spin_unlock(&fs_info->reada_lock); + + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* no fs_info->reada_lock needed, as this can't be + * the last ref */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + if (re->scheduled_for) + atomic_dec(&re->scheduled_for->reada_in_flight); + + kfree(re); +} + +static void reada_zone_release(struct kref *kref) +{ + struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); + + radix_tree_delete(&zone->device->reada_zones, + zone->end >> PAGE_CACHE_SHIFT); + + kfree(zone); +} + +static void reada_control_release(struct kref *kref) +{ + struct reada_control *rc = container_of(kref, struct reada_control, + refcnt); + + kfree(rc); +} + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation) +{ + struct btrfs_root *root = rc->root; + struct reada_extent *re; + struct reada_extctl *rec; + + re = reada_find_extent(root, logical, top, level); /* takes one ref */ + if (!re) + return -1; + + rec = kzalloc(sizeof(*rec), GFP_NOFS); + if (!rec) { + reada_extent_put(root->fs_info, re); + return -1; + } + + rec->rc = rc; + rec->generation = generation; + atomic_inc(&rc->elems); + + spin_lock(&re->lock); + list_add_tail(&rec->list, &re->extctl); + spin_unlock(&re->lock); + + /* leave the ref on the extent */ + + return 0; +} + +/* + * called with fs_info->reada_lock held + */ +static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) +{ + int i; + unsigned long index = zone->end >> PAGE_CACHE_SHIFT; + + for (i = 0; i < zone->ndevs; ++i) { + struct reada_zone *peer; + peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index); + if (peer && peer->device != zone->device) + peer->locked = lock; + } +} + +/* + * called with fs_info->reada_lock held + */ +static int reada_pick_zone(struct btrfs_device *dev) +{ + struct reada_zone *top_zone = NULL; + struct reada_zone *top_locked_zone = NULL; + u64 top_elems = 0; + u64 top_locked_elems = 0; + unsigned long index = 0; + int ret; + + if (dev->reada_curr_zone) { + reada_peer_zones_set_lock(dev->reada_curr_zone, 0); + kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release); + dev->reada_curr_zone = NULL; + } + /* pick the zone with the most elements */ + while (1) { + struct reada_zone *zone; + + ret = radix_tree_gang_lookup(&dev->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + if (zone->locked) { + if (zone->elems > top_locked_elems) { + top_locked_elems = zone->elems; + top_locked_zone = zone; + } + } else { + if (zone->elems > top_elems) { + top_elems = zone->elems; + top_zone = zone; + } + } + } + if (top_zone) + dev->reada_curr_zone = top_zone; + else if (top_locked_zone) + dev->reada_curr_zone = top_locked_zone; + else + return 0; + + dev->reada_next = dev->reada_curr_zone->start; + kref_get(&dev->reada_curr_zone->refcnt); + reada_peer_zones_set_lock(dev->reada_curr_zone, 1); + + return 1; +} + +static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev) +{ + struct reada_extent *re = NULL; + int mirror_num = 0; + struct extent_buffer *eb = NULL; + u64 logical; + u32 blocksize; + int ret; + int i; + int need_kick = 0; + + spin_lock(&fs_info->reada_lock); + if (dev->reada_curr_zone == NULL) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + } + /* + * FIXME currently we issue the reads one extent at a time. If we have + * a contiguous block of extents, we could also coagulate them or use + * plugging to speed things up + */ + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + re = NULL; + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + } + if (ret == 0) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + dev->reada_next = re->logical + re->blocksize; + kref_get(&re->refcnt); + + spin_unlock(&fs_info->reada_lock); + + /* + * find mirror num + */ + for (i = 0; i < re->nzones; ++i) { + if (re->zones[i]->device == dev) { + mirror_num = i + 1; + break; + } + } + logical = re->logical; + blocksize = re->blocksize; + + spin_lock(&re->lock); + if (re->scheduled_for == NULL) { + re->scheduled_for = dev; + need_kick = 1; + } + spin_unlock(&re->lock); + + reada_extent_put(fs_info, re); + + if (!need_kick) + return 0; + + atomic_inc(&dev->reada_in_flight); + ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, + mirror_num, &eb); + if (ret) + __readahead_hook(fs_info->extent_root, NULL, logical, ret); + else if (eb) + __readahead_hook(fs_info->extent_root, eb, eb->start, ret); + + if (eb) + free_extent_buffer(eb); + + return 1; + +} + +static void reada_start_machine_worker(struct btrfs_work *work) +{ + struct reada_machine_work *rmw; + struct btrfs_fs_info *fs_info; + + rmw = container_of(work, struct reada_machine_work, work); + fs_info = rmw->fs_info; + + kfree(rmw); + + __reada_start_machine(fs_info); +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 enqueued; + u64 total = 0; + int i; + + do { + enqueued = 0; + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (atomic_read(&device->reada_in_flight) < + MAX_IN_FLIGHT) + enqueued += reada_start_machine_dev(fs_info, + device); + } + total += enqueued; + } while (enqueued && total < 10000); + + if (enqueued == 0) + return; + + /* + * If everything is already in the cache, this is effectively single + * threaded. To a) not hold the caller for too long and b) to utilize + * more cores, we broke the loop above after 10000 iterations and now + * enqueue to workers to finish it. This will distribute the load to + * the cores. + */ + for (i = 0; i < 2; ++i) + reada_start_machine(fs_info); +} + +static void reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct reada_machine_work *rmw; + + rmw = kzalloc(sizeof(*rmw), GFP_NOFS); + if (!rmw) { + /* FIXME we cannot handle this properly right now */ + BUG(); + } + rmw->work.func = reada_start_machine_worker; + rmw->fs_info = fs_info; + + btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); +} + +#ifdef DEBUG +static void dump_devs(struct btrfs_fs_info *fs_info, int all) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + unsigned long index; + int ret; + int i; + int j; + int cnt; + + spin_lock(&fs_info->reada_lock); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid, + atomic_read(&device->reada_in_flight)); + index = 0; + while (1) { + struct reada_zone *zone; + ret = radix_tree_gang_lookup(&device->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG " zone %llu-%llu elems %llu locked " + "%d devs", zone->start, zone->end, zone->elems, + zone->locked); + for (j = 0; j < zone->ndevs; ++j) { + printk(KERN_CONT " %lld", + zone->devs[j]->devid); + } + if (device->reada_curr_zone == zone) + printk(KERN_CONT " curr off %llu", + device->reada_next - zone->start); + printk(KERN_CONT "\n"); + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + } + cnt = 0; + index = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&device->reada_extents, + (void **)&re, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG + " re: logical %llu size %u empty %d for %lld", + re->logical, re->blocksize, + list_empty(&re->extctl), re->scheduled_for ? + re->scheduled_for->devid : -1); + + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + if (++cnt > 15) + break; + } + } + + index = 0; + cnt = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re, + index, 1); + if (ret == 0) + break; + if (!re->scheduled_for) { + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + continue; + } + printk(KERN_DEBUG + "re: logical %llu size %u list empty %d for %lld", + re->logical, re->blocksize, list_empty(&re->extctl), + re->scheduled_for ? re->scheduled_for->devid : -1); + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + } + spin_unlock(&fs_info->reada_lock); +} +#endif + +/* + * interface + */ +struct reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *key_start, struct btrfs_key *key_end) +{ + struct reada_control *rc; + u64 start; + u64 generation; + int level; + struct extent_buffer *node; + static struct btrfs_key max_key = { + .objectid = (u64)-1, + .type = (u8)-1, + .offset = (u64)-1 + }; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return ERR_PTR(-ENOMEM); + + rc->root = root; + rc->key_start = *key_start; + rc->key_end = *key_end; + atomic_set(&rc->elems, 0); + init_waitqueue_head(&rc->wait); + kref_init(&rc->refcnt); + kref_get(&rc->refcnt); /* one ref for having elements */ + + node = btrfs_root_node(root); + start = node->start; + level = btrfs_header_level(node); + generation = btrfs_header_generation(node); + free_extent_buffer(node); + + reada_add_block(rc, start, &max_key, level, generation); + + reada_start_machine(root->fs_info); + + return rc; +} + +#ifdef DEBUG +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, + 5 * HZ); + dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0); + } + + dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0); + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#else +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event(rc->wait, atomic_read(&rc->elems) == 0); + } + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#endif + +void btrfs_reada_detach(void *handle) +{ + struct reada_control *rc = handle; + + kref_put(&rc->refcnt, reada_control_release); +} diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c deleted file mode 100644 index 82d569cb6267..000000000000 --- a/fs/btrfs/ref-cache.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/sort.h> -#include "ctree.h" -#include "ref-cache.h" -#include "transaction.h" - -static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct btrfs_leaf_ref *entry; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node); - - if (bytenr < entry->bytenr) - p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) - p = &(*p)->rb_right; - else - return parent; - } - - entry = rb_entry(node, struct btrfs_leaf_ref, rb_node); - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) -{ - struct rb_node *n = root->rb_node; - struct btrfs_leaf_ref *entry; - - while (n) { - entry = rb_entry(n, struct btrfs_leaf_ref, rb_node); - WARN_ON(!entry->in_tree); - - if (bytenr < entry->bytenr) - n = n->rb_left; - else if (bytenr > entry->bytenr) - n = n->rb_right; - else - return n; - } - return NULL; -} diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h deleted file mode 100644 index 24f7001f6387..000000000000 --- a/fs/btrfs/ref-cache.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ -#ifndef __REFCACHE__ -#define __REFCACHE__ - -struct btrfs_extent_info { - /* bytenr and num_bytes find the extent in the extent allocation tree */ - u64 bytenr; - u64 num_bytes; - - /* objectid and offset find the back reference for the file */ - u64 objectid; - u64 offset; -}; - -struct btrfs_leaf_ref { - struct rb_node rb_node; - struct btrfs_leaf_ref_tree *tree; - int in_tree; - atomic_t usage; - - u64 root_gen; - u64 bytenr; - u64 owner; - u64 generation; - int nritems; - - struct list_head list; - struct btrfs_extent_info extents[]; -}; - -static inline size_t btrfs_leaf_ref_size(int nr_extents) -{ - return sizeof(struct btrfs_leaf_ref) + - sizeof(struct btrfs_extent_info) * nr_extents; -} -#endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 5e0a3dc79a45..dff29d5e151a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, list_add_tail(&new_edge->list[UPPER], &new_node->lower); } + } else { + list_add_tail(&new_node->lower, &cache->leaves); } rb_node = tree_insert(&cache->rb_root, new_node->bytenr, @@ -2041,8 +2043,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, - min_reserved, 0); + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -2152,8 +2153,7 @@ int prepare_to_merge(struct reloc_control *rc, int err) again: if (!err) { num_bytes = rc->merging_rsv_size; - ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, - num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); if (ret) err = ret; } @@ -2427,7 +2427,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; @@ -2922,6 +2922,7 @@ static int relocate_file_extent_cluster(struct inode *inode, unsigned long last_index; struct page *page; struct file_ra_state *ra; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int nr = 0; int ret = 0; @@ -2955,7 +2956,8 @@ static int relocate_file_extent_cluster(struct inode *inode, page_cache_sync_readahead(inode->i_mapping, ra, NULL, index, last_index + 1 - index); - page = grab_cache_page(inode->i_mapping, index); + page = find_or_create_page(inode->i_mapping, index, + mask); if (!page) { btrfs_delalloc_release_metadata(inode, PAGE_CACHE_SIZE); @@ -3322,8 +3324,11 @@ static int find_data_references(struct reloc_control *rc, } key.objectid = ref_objectid; - key.offset = ref_offset; key.type = BTRFS_EXTENT_DATA_KEY; + if (ref_offset > ((u64)-1 << 32)) + key.offset = 0; + else + key.offset = ref_offset; path->search_commit_root = 1; path->skip_locking = 1; @@ -3644,14 +3649,11 @@ int prepare_to_relocate(struct reloc_control *rc) * btrfs_init_reloc_root will use them when there * is no reservation in transaction handle. */ - ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, + ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, rc->extent_root->nodesize * 256); if (ret) return ret; - rc->block_rsv->refill_used = 1; - btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); - memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; rc->extents_found = 0; @@ -3776,8 +3778,7 @@ restart: } } - ret = btrfs_block_rsv_check(trans, rc->extent_root, - rc->block_rsv, 0, 5); + ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5); if (ret < 0) { if (ret != -EAGAIN) { err = ret; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index ebe45443de06..f4099904565a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -71,13 +71,12 @@ out: return ret; } -int btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node) +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node) { btrfs_set_root_bytenr(item, node->start); btrfs_set_root_level(item, btrfs_header_level(node)); btrfs_set_root_generation(item, btrfs_header_generation(node)); - return 0; } /* diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5d..c27bcb67f330 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -17,10 +17,14 @@ */ #include <linux/blkdev.h> +#include <linux/ratelimit.h> #include "ctree.h" #include "volumes.h" #include "disk-io.h" #include "ordered-data.h" +#include "transaction.h" +#include "backref.h" +#include "extent_io.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -29,15 +33,12 @@ * any can be found. * * Future enhancements: - * - To enhance the performance, better read-ahead strategies for the - * extent-tree can be employed. * - In case an unrepairable extent is encountered, track which files are * affected and report them * - In case of a read error on files with nodatasum, map the file and read * the extent to trigger a writeback of the good copy * - track and record media errors, throw out bad devices * - add a mode to also read unallocated space - * - make the prefetch cancellable */ struct scrub_bio; @@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix); struct scrub_page { u64 flags; /* extent flags */ u64 generation; - u64 mirror_num; + int mirror_num; int have_csum; u8 csum[BTRFS_CSUM_SIZE]; }; @@ -87,6 +88,7 @@ struct scrub_dev { int first_free; int curr; atomic_t in_flight; + atomic_t fixup_cnt; spinlock_t list_lock; wait_queue_head_t list_wait; u16 csum_size; @@ -100,6 +102,27 @@ struct scrub_dev { spinlock_t stat_lock; }; +struct scrub_fixup_nodatasum { + struct scrub_dev *sdev; + u64 logical; + struct btrfs_root *root; + struct btrfs_work work; + int mirror_num; +}; + +struct scrub_warning { + struct btrfs_path *path; + u64 extent_item_size; + char *scratch_buf; + char *msg_buf; + const char *errstr; + sector_t sector; + u64 logical; + struct btrfs_device *dev; + int msg_bufsize; + int scratch_bufsize; +}; + static void scrub_free_csums(struct scrub_dev *sdev) { while (!list_empty(&sdev->csum_list)) { @@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; - else + else sdev->bios[i]->next_free = -1; } sdev->first_free = 0; sdev->curr = -1; atomic_set(&sdev->in_flight, 0); + atomic_set(&sdev->fixup_cnt, 0); atomic_set(&sdev->cancel_req, 0); - sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); + sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); INIT_LIST_HEAD(&sdev->csum_list); spin_lock_init(&sdev->list_lock); @@ -195,24 +219,366 @@ nomem: return ERR_PTR(-ENOMEM); } +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) +{ + u64 isize; + u32 nlink; + int ret; + int i; + struct extent_buffer *eb; + struct btrfs_inode_item *inode_item; + struct scrub_warning *swarn = ctx; + struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; + struct inode_fs_paths *ipath = NULL; + struct btrfs_root *local_root; + struct btrfs_key root_key; + + root_key.objectid = root; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(local_root)) { + ret = PTR_ERR(local_root); + goto err; + } + + ret = inode_item_info(inum, 0, local_root, swarn->path); + if (ret) { + btrfs_release_path(swarn->path); + goto err; + } + + eb = swarn->path->nodes[0]; + inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], + struct btrfs_inode_item); + isize = btrfs_inode_size(eb, inode_item); + nlink = btrfs_inode_nlink(eb, inode_item); + btrfs_release_path(swarn->path); + + ipath = init_ipath(4096, local_root, swarn->path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto err; + } + ret = paths_from_inode(inum, ipath); + + if (ret < 0) + goto err; + + /* + * we deliberately ignore the bit ipath might have been too small to + * hold all of the paths here + */ + for (i = 0; i < ipath->fspath->elem_cnt; ++i) + printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu, " + "length %llu, links %u (path: %s)\n", swarn->errstr, + swarn->logical, swarn->dev->name, + (unsigned long long)swarn->sector, root, inum, offset, + min(isize - offset, (u64)PAGE_SIZE), nlink, + (char *)(unsigned long)ipath->fspath->val[i]); + + free_ipath(ipath); + return 0; + +err: + printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu: path " + "resolving failed with ret=%d\n", swarn->errstr, + swarn->logical, swarn->dev->name, + (unsigned long long)swarn->sector, root, inum, offset, ret); + + free_ipath(ipath); + return 0; +} + +static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, + int ix) +{ + struct btrfs_device *dev = sbio->sdev->dev; + struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct scrub_warning swarn; + u32 item_size; + int ret; + u64 ref_root; + u8 ref_level; + unsigned long ptr = 0; + const int bufsize = 4096; + u64 extent_offset; + + path = btrfs_alloc_path(); + + swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); + swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); + swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + swarn.logical = sbio->logical + ix * PAGE_SIZE; + swarn.errstr = errstr; + swarn.dev = dev; + swarn.msg_bufsize = bufsize; + swarn.scratch_bufsize = bufsize; + + if (!path || !swarn.scratch_buf || !swarn.msg_buf) + goto out; + + ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); + if (ret < 0) + goto out; + + extent_offset = swarn.logical - found_key.objectid; + swarn.extent_item_size = found_key.offset; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + do { + ret = tree_backref_for_extent(&ptr, eb, ei, item_size, + &ref_root, &ref_level); + printk(KERN_WARNING "%s at logical %llu on dev %s, " + "sector %llu: metadata %s (level %d) in tree " + "%llu\n", errstr, swarn.logical, dev->name, + (unsigned long long)swarn.sector, + ref_level ? "node" : "leaf", + ret < 0 ? -1 : ref_level, + ret < 0 ? -1 : ref_root); + } while (ret != 1); + } else { + swarn.path = path; + iterate_extent_inodes(fs_info, path, found_key.objectid, + extent_offset, + scrub_print_warning_inode, &swarn); + } + +out: + btrfs_free_path(path); + kfree(swarn.scratch_buf); + kfree(swarn.msg_buf); +} + +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct page *page = NULL; + unsigned long index; + struct scrub_fixup_nodatasum *fixup = ctx; + int ret; + int corrected = 0; + struct btrfs_key key; + struct inode *inode = NULL; + u64 end = offset + PAGE_SIZE - 1; + struct btrfs_root *local_root; + + key.objectid = root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); + if (IS_ERR(local_root)) + return PTR_ERR(local_root); + + key.type = BTRFS_INODE_ITEM_KEY; + key.objectid = inum; + key.offset = 0; + inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + index = offset >> PAGE_CACHE_SHIFT; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + goto out; + } + + if (PageUptodate(page)) { + struct btrfs_mapping_tree *map_tree; + if (PageDirty(page)) { + /* + * we need to write the data to the defect sector. the + * data that was in that sector is not in memory, + * because the page was modified. we must not write the + * modified page to that sector. + * + * TODO: what could be done here: wait for the delalloc + * runner to write out that page (might involve + * COW) and see whether the sector is still + * referenced afterwards. + * + * For the meantime, we'll treat this error + * incorrectable, although there is a chance that a + * later scrub will find the bad sector again and that + * there's no dirty page in memory, then. + */ + ret = -EIO; + goto out; + } + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; + ret = repair_io_failure(map_tree, offset, PAGE_SIZE, + fixup->logical, page, + fixup->mirror_num); + unlock_page(page); + corrected = !ret; + } else { + /* + * we need to get good data first. the general readpage path + * will call repair_io_failure for us, we just have to make + * sure we read the bad mirror. + */ + ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + if (ret) { + /* set_extent_bits should give proper error */ + WARN_ON(ret > 0); + if (ret > 0) + ret = -EFAULT; + goto out; + } + + ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, + btrfs_get_extent, + fixup->mirror_num); + wait_on_page_locked(page); + + corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, + end, EXTENT_DAMAGED, 0, NULL); + if (!corrected) + clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + } + +out: + if (page) + put_page(page); + if (inode) + iput(inode); + + if (ret < 0) + return ret; + + if (ret == 0 && corrected) { + /* + * we only need to call readpage for one of the inodes belonging + * to this extent. so make iterate_extent_inodes stop + */ + return 1; + } + + return -EIO; +} + +static void scrub_fixup_nodatasum(struct btrfs_work *work) +{ + int ret; + struct scrub_fixup_nodatasum *fixup; + struct scrub_dev *sdev; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path; + int uncorrectable = 0; + + fixup = container_of(work, struct scrub_fixup_nodatasum, work); + sdev = fixup->sdev; + fs_info = fixup->root->fs_info; + + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.malloc_errors; + spin_unlock(&sdev->stat_lock); + uncorrectable = 1; + goto out; + } + + trans = btrfs_join_transaction(fixup->root); + if (IS_ERR(trans)) { + uncorrectable = 1; + goto out; + } + + /* + * the idea is to trigger a regular read through the standard path. we + * read a page from the (failed) logical address by specifying the + * corresponding copynum of the failed sector. thus, that readpage is + * expected to fail. + * that is the point where on-the-fly error correction will kick in + * (once it's finished) and rewrite the failed sector if a good copy + * can be found. + */ + ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, + path, scrub_fixup_readpage, + fixup); + if (ret < 0) { + uncorrectable = 1; + goto out; + } + WARN_ON(ret != 1); + + spin_lock(&sdev->stat_lock); + ++sdev->stat.corrected_errors; + spin_unlock(&sdev->stat_lock); + +out: + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, fixup->root); + if (uncorrectable) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.uncorrectable_errors; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR "btrfs: unable to fixup " + "(nodatasum) error at logical %llu\n", + fixup->logical); + } + + btrfs_free_path(path); + kfree(fixup); + + /* see caller why we're pretending to be paused in the scrub counters */ + mutex_lock(&fs_info->scrub_lock); + atomic_dec(&fs_info->scrubs_running); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_dec(&sdev->fixup_cnt); + wake_up(&fs_info->scrub_pause_wait); + wake_up(&sdev->list_wait); +} + /* * scrub_recheck_error gets called when either verification of the page * failed or the bio failed to read, e.g. with EIO. In the latter case, * recheck_error gets called for every page in the bio, even though only * one may be bad */ -static void scrub_recheck_error(struct scrub_bio *sbio, int ix) +static int scrub_recheck_error(struct scrub_bio *sbio, int ix) { + struct scrub_dev *sdev = sbio->sdev; + u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + if (sbio->err) { - if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, - (sbio->physical + ix * PAGE_SIZE) >> 9, + if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, sbio->bio->bi_io_vec[ix].bv_page) == 0) { if (scrub_fixup_check(sbio, ix) == 0) - return; + return 0; } + if (__ratelimit(&_rs)) + scrub_print_warning("i/o error", sbio, ix); + } else { + if (__ratelimit(&_rs)) + scrub_print_warning("checksum error", sbio, ix); } + spin_lock(&sdev->stat_lock); + ++sdev->stat.read_errors; + spin_unlock(&sdev->stat_lock); + scrub_fixup(sbio, ix); + return 1; } static int scrub_fixup_check(struct scrub_bio *sbio, int ix) @@ -250,7 +616,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) struct scrub_dev *sdev = sbio->sdev; struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; + struct scrub_fixup_nodatasum *fixup; u64 logical = sbio->logical + ix * PAGE_SIZE; u64 length; int i; @@ -259,38 +626,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && (sbio->spag[ix].have_csum == 0)) { + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + goto uncorrectable; + fixup->sdev = sdev; + fixup->logical = logical; + fixup->root = fs_info->extent_root; + fixup->mirror_num = sbio->spag[ix].mirror_num; /* - * nodatasum, don't try to fix anything - * FIXME: we can do better, open the inode and trigger a - * writeback + * increment scrubs_running to prevent cancel requests from + * completing as long as a fixup worker is running. we must also + * increment scrubs_paused to prevent deadlocking on pause + * requests used for transactions commits (as the worker uses a + * transaction context). it is safe to regard the fixup worker + * as paused for all matters practical. effectively, we only + * avoid cancellation requests from completing. */ - goto uncorrectable; + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrubs_running); + atomic_inc(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_inc(&sdev->fixup_cnt); + fixup->work.func = scrub_fixup_nodatasum; + btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); + return; } length = PAGE_SIZE; ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, - &multi, 0); - if (ret || !multi || length < PAGE_SIZE) { + &bbio, 0); + if (ret || !bbio || length < PAGE_SIZE) { printk(KERN_ERR "scrub_fixup: btrfs_map_block failed us for %llu\n", (unsigned long long)logical); WARN_ON(1); + kfree(bbio); return; } - if (multi->num_stripes == 1) + if (bbio->num_stripes == 1) /* there aren't any replicas */ goto uncorrectable; /* * first find a good copy */ - for (i = 0; i < multi->num_stripes; ++i) { - if (i == sbio->spag[ix].mirror_num) + for (i = 0; i < bbio->num_stripes; ++i) { + if (i + 1 == sbio->spag[ix].mirror_num) continue; - if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, - multi->stripes[i].physical >> 9, + if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, + bbio->stripes[i].physical >> 9, sbio->bio->bi_io_vec[ix].bv_page)) { /* I/O-error, this is not a good copy */ continue; @@ -299,7 +685,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if (scrub_fixup_check(sbio, ix) == 0) break; } - if (i == multi->num_stripes) + if (i == bbio->num_stripes) goto uncorrectable; if (!sdev->readonly) { @@ -314,25 +700,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) } } - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.corrected_errors; spin_unlock(&sdev->stat_lock); - if (printk_ratelimit()) - printk(KERN_ERR "btrfs: fixed up at %llu\n", - (unsigned long long)logical); + printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", + (unsigned long long)logical); return; uncorrectable: - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - if (printk_ratelimit()) - printk(KERN_ERR "btrfs: unable to fixup at %llu\n", - (unsigned long long)logical); + printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " + "logical %llu\n", (unsigned long long)logical); } static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, @@ -382,8 +766,14 @@ static void scrub_checksum(struct btrfs_work *work) int ret; if (sbio->err) { + ret = 0; for (i = 0; i < sbio->count; ++i) - scrub_recheck_error(sbio, i); + ret |= scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); + } sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); sbio->bio->bi_flags |= 1 << BIO_UPTODATE; @@ -396,10 +786,6 @@ static void scrub_checksum(struct btrfs_work *work) bi->bv_offset = 0; bi->bv_len = PAGE_SIZE; } - - spin_lock(&sdev->stat_lock); - ++sdev->stat.read_errors; - spin_unlock(&sdev->stat_lock); goto out; } for (i = 0; i < sbio->count; ++i) { @@ -420,8 +806,14 @@ static void scrub_checksum(struct btrfs_work *work) WARN_ON(1); } kunmap_atomic(buffer, KM_USER0); - if (ret) - scrub_recheck_error(sbio, i); + if (ret) { + ret = scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); + } + } } out: @@ -557,57 +949,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) static int scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; - struct bio *bio; - int i; if (sdev->curr == -1) return 0; sbio = sdev->bios[sdev->curr]; - - bio = bio_alloc(GFP_NOFS, sbio->count); - if (!bio) - goto nomem; - - bio->bi_private = sbio; - bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sdev->dev->bdev; - bio->bi_sector = sbio->physical >> 9; - - for (i = 0; i < sbio->count; ++i) { - struct page *page; - int ret; - - page = alloc_page(GFP_NOFS); - if (!page) - goto nomem; - - ret = bio_add_page(bio, page, PAGE_SIZE, 0); - if (!ret) { - __free_page(page); - goto nomem; - } - } - sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(READ, bio); + submit_bio(READ, sbio->bio); return 0; - -nomem: - scrub_free_bio(bio); - - return -ENOMEM; } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, u64 mirror_num, + u64 physical, u64 flags, u64 gen, int mirror_num, u8 *csum, int force) { struct scrub_bio *sbio; + struct page *page; + int ret; again: /* @@ -628,12 +990,22 @@ again: } sbio = sdev->bios[sdev->curr]; if (sbio->count == 0) { + struct bio *bio; + sbio->physical = physical; sbio->logical = logical; + bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); + if (!bio) + return -ENOMEM; + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sdev->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + sbio->err = 0; + sbio->bio = bio; } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || sbio->logical + sbio->count * PAGE_SIZE != logical) { - int ret; - ret = scrub_submit(sdev); if (ret) return ret; @@ -643,6 +1015,20 @@ again: sbio->spag[sbio->count].generation = gen; sbio->spag[sbio->count].have_csum = 0; sbio->spag[sbio->count].mirror_num = mirror_num; + + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + + ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); + if (!ret) { + __free_page(page); + ret = scrub_submit(sdev); + if (ret) + return ret; + goto again; + } + if (csum) { sbio->spag[sbio->count].have_csum = 1; memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); @@ -701,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, /* scrub extent tries to collect up to 64 kB for each bio */ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, u64 mirror_num) + u64 physical, u64 flags, u64 gen, int mirror_num) { int ret; u8 csum[BTRFS_CSUM_SIZE]; @@ -741,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, int slot; int i; u64 nstripes; - int start_stripe; struct extent_buffer *l; struct btrfs_key key; u64 physical; u64 logical; u64 generation; - u64 mirror_num; + int mirror_num; + struct reada_control *reada1; + struct reada_control *reada2; + struct btrfs_key key_start; + struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; @@ -758,102 +1147,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; - mirror_num = 0; + mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; offset = map->stripe_len * (num / map->sub_stripes); increment = map->stripe_len * factor; - mirror_num = num % map->sub_stripes; + mirror_num = num % map->sub_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { increment = map->stripe_len; - mirror_num = num % map->num_stripes; + mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; - mirror_num = num % map->num_stripes; + mirror_num = num % map->num_stripes + 1; } else { increment = map->stripe_len; - mirror_num = 0; + mirror_num = 1; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; path->search_commit_root = 1; path->skip_locking = 1; /* - * find all extents for each stripe and just read them to get - * them into the page cache - * FIXME: we can do better. build a more intelligent prefetching + * trigger the readahead for extent tree csum tree and wait for + * completion. During readahead, the scrub is officially paused + * to not hold off transaction commits */ logical = base + offset; - physical = map->stripes[num].physical; - ret = 0; - for (i = 0; i < nstripes; ++i) { - key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out_noplug; - - /* - * we might miss half an extent here, but that doesn't matter, - * as it's only the prefetch - */ - while (1) { - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out_noplug; - break; - } - btrfs_item_key_to_cpu(l, &key, slot); + wait_event(sdev->list_wait, + atomic_read(&sdev->in_flight) == 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); - if (key.objectid >= logical + map->stripe_len) - break; + /* FIXME it might be better to start readahead at commit root */ + key_start.objectid = logical; + key_start.type = BTRFS_EXTENT_ITEM_KEY; + key_start.offset = (u64)0; + key_end.objectid = base + offset + nstripes * increment; + key_end.type = BTRFS_EXTENT_ITEM_KEY; + key_end.offset = (u64)0; + reada1 = btrfs_reada_add(root, &key_start, &key_end); + + key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_start.type = BTRFS_EXTENT_CSUM_KEY; + key_start.offset = logical; + key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_end.type = BTRFS_EXTENT_CSUM_KEY; + key_end.offset = base + offset + nstripes * increment; + reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + + if (!IS_ERR(reada1)) + btrfs_reada_wait(reada1); + if (!IS_ERR(reada2)) + btrfs_reada_wait(reada2); - path->slots[0]++; - } - btrfs_release_path(path); - logical += increment; - physical += map->stripe_len; - cond_resched(); + mutex_lock(&fs_info->scrub_lock); + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); } + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); /* * collect all data csums for the stripe to avoid seeking during * the scrub. This might currently (crc32) end up to be about 1MB */ - start_stripe = 0; blk_start_plug(&plug); -again: - logical = base + offset + start_stripe * increment; - for (i = start_stripe; i < nstripes; ++i) { - ret = btrfs_lookup_csums_range(csum_root, logical, - logical + map->stripe_len - 1, - &sdev->csum_list, 1); - if (ret) - goto out; - logical += increment; - cond_resched(); - } /* * now find all extents for each stripe and scrub them */ - logical = base + offset + start_stripe * increment; - physical = map->stripes[num].physical + start_stripe * map->stripe_len; + logical = base + offset; + physical = map->stripes[num].physical; ret = 0; - for (i = start_stripe; i < nstripes; ++i) { + for (i = 0; i < nstripes; ++i) { /* * canceled? */ @@ -882,11 +1257,14 @@ again: atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); wake_up(&fs_info->scrub_pause_wait); - scrub_free_csums(sdev); - start_stripe = i; - goto again; } + ret = btrfs_lookup_csums_range(csum_root, logical, + logical + map->stripe_len - 1, + &sdev->csum_list, 1); + if (ret) + goto out; + key.objectid = logical; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)0; @@ -982,7 +1360,6 @@ next: out: blk_finish_plug(&plug); -out_noplug: btrfs_free_path(path); return ret < 0 ? ret : 0; } @@ -1253,10 +1630,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, ret = scrub_enumerate_chunks(sdev, start, end); wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); - atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); + wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); + if (progress) memcpy(progress, &sdev->stat, sizeof(*progress)); diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index c0f7ecaf1e79..bc1f6ad18442 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -50,36 +50,22 @@ u##bits btrfs_##name(struct extent_buffer *eb, \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ type *p; \ - /* ugly, but we want the fast path here */ \ - if (eb->map_token && offset >= eb->map_start && \ - offset + sizeof(((type *)0)->member) <= eb->map_start + \ - eb->map_len) { \ - p = (type *)(eb->kaddr + part_offset - eb->map_start); \ - return le##bits##_to_cpu(p->member); \ - } \ - { \ - int err; \ - char *map_token; \ - char *kaddr; \ - int unmap_on_exit = (eb->map_token == NULL); \ - unsigned long map_start; \ - unsigned long map_len; \ - u##bits res; \ - err = map_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ - &map_token, &kaddr, \ - &map_start, &map_len, KM_USER1); \ - if (err) { \ - __le##bits leres; \ - read_eb_member(eb, s, type, member, &leres); \ - return le##bits##_to_cpu(leres); \ - } \ - p = (type *)(kaddr + part_offset - map_start); \ - res = le##bits##_to_cpu(p->member); \ - if (unmap_on_exit) \ - unmap_extent_buffer(eb, map_token, KM_USER1); \ - return res; \ - } \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + u##bits res; \ + err = map_private_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits leres; \ + read_eb_member(eb, s, type, member, &leres); \ + return le##bits##_to_cpu(leres); \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + res = le##bits##_to_cpu(p->member); \ + return res; \ } \ void btrfs_set_##name(struct extent_buffer *eb, \ type *s, u##bits val) \ @@ -87,36 +73,21 @@ void btrfs_set_##name(struct extent_buffer *eb, \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ type *p; \ - /* ugly, but we want the fast path here */ \ - if (eb->map_token && offset >= eb->map_start && \ - offset + sizeof(((type *)0)->member) <= eb->map_start + \ - eb->map_len) { \ - p = (type *)(eb->kaddr + part_offset - eb->map_start); \ - p->member = cpu_to_le##bits(val); \ - return; \ - } \ - { \ - int err; \ - char *map_token; \ - char *kaddr; \ - int unmap_on_exit = (eb->map_token == NULL); \ - unsigned long map_start; \ - unsigned long map_len; \ - err = map_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ - &map_token, &kaddr, \ - &map_start, &map_len, KM_USER1); \ - if (err) { \ - __le##bits val2; \ - val2 = cpu_to_le##bits(val); \ - write_eb_member(eb, s, type, member, &val2); \ - return; \ - } \ - p = (type *)(kaddr + part_offset - map_start); \ - p->member = cpu_to_le##bits(val); \ - if (unmap_on_exit) \ - unmap_extent_buffer(eb, map_token, KM_USER1); \ - } \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + err = map_private_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits val2; \ + val2 = cpu_to_le##bits(val); \ + write_eb_member(eb, s, type, member, &val2); \ + return; \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + p->member = cpu_to_le##bits(val); \ } #include "ctree.h" @@ -125,15 +96,6 @@ void btrfs_node_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr = btrfs_node_key_ptr_offset(nr); - if (eb->map_token && ptr >= eb->map_start && - ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) { - memcpy(disk_key, eb->kaddr + ptr - eb->map_start, - sizeof(*disk_key)); - return; - } else if (eb->map_token) { - unmap_extent_buffer(eb, eb->map_token, KM_USER1); - eb->map_token = NULL; - } read_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 15634d4648d7..e28ad4baf483 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -40,6 +40,7 @@ #include <linux/magic.h> #include <linux/slab.h> #include <linux/cleancache.h> +#include <linux/mnt_namespace.h> #include "compat.h" #include "delayed-inode.h" #include "ctree.h" @@ -58,6 +59,7 @@ #include <trace/events/btrfs.h> static const struct super_operations btrfs_super_ops; +static struct file_system_type btrfs_fs_type; static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, char nbuf[16]) @@ -162,7 +164,7 @@ enum { Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_err, + Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, }; static match_table_t tokens = { @@ -195,6 +197,8 @@ static match_table_t tokens = { {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, {Opt_inode_cache, "inode_cache"}, + {Opt_no_space_cache, "nospace_cache"}, + {Opt_recovery, "recovery"}, {Opt_err, NULL}, }; @@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) { struct btrfs_fs_info *info = root->fs_info; substring_t args[MAX_OPT_ARGS]; - char *p, *num, *orig; + char *p, *num, *orig = NULL; + u64 cache_gen; int intarg; int ret = 0; char *compress_type; bool compress_force = false; + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); + if (cache_gen) + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + if (!options) - return 0; + goto out; /* * strsep changes the string, duplicate it because parse_options @@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, DISCARD); break; case Opt_space_cache: - printk(KERN_INFO "btrfs: enabling disk space caching\n"); btrfs_set_opt(info->mount_opt, SPACE_CACHE); break; + case Opt_no_space_cache: + printk(KERN_INFO "btrfs: disabling disk space caching\n"); + btrfs_clear_opt(info->mount_opt, SPACE_CACHE); + break; case Opt_inode_cache: printk(KERN_INFO "btrfs: enabling inode map caching\n"); btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); @@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto defrag"); btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); break; + case Opt_recovery: + printk(KERN_INFO "btrfs: enabling auto recovery"); + btrfs_set_opt(info->mount_opt, RECOVERY); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } } out: + if (!ret && btrfs_test_opt(root, SPACE_CACHE)) + printk(KERN_INFO "btrfs: disk space caching is enabled\n"); kfree(orig); return ret; } @@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; - char *opts, *orig, *p; + char *device_name, *opts, *orig, *p; int error = 0; int intarg; if (!options) - goto out; + return 0; /* * strsep changes the string, duplicate it because parse_options @@ -430,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, token = match_token(p, tokens, args); switch (token) { case Opt_subvol: + kfree(*subvol_name); *subvol_name = match_strdup(&args[0]); break; case Opt_subvolid: @@ -457,29 +476,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, } break; case Opt_device: - error = btrfs_scan_one_device(match_strdup(&args[0]), + device_name = match_strdup(&args[0]); + if (!device_name) { + error = -ENOMEM; + goto out; + } + error = btrfs_scan_one_device(device_name, flags, holder, fs_devices); + kfree(device_name); if (error) - goto out_free_opts; + goto out; break; default: break; } } - out_free_opts: +out: kfree(orig); - out: - /* - * If no subvolume name is specified we use the default one. Allocate - * a copy of the string "." here so that code later in the - * mount path doesn't care if it's the default volume or another one. - */ - if (!*subvol_name) { - *subvol_name = kstrdup(".", GFP_KERNEL); - if (!*subvol_name) - return -ENOMEM; - } return error; } @@ -492,7 +506,6 @@ static struct dentry *get_default_root(struct super_block *sb, struct btrfs_path *path; struct btrfs_key location; struct inode *inode; - struct dentry *dentry; u64 dir_id; int new = 0; @@ -517,7 +530,7 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. */ - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); @@ -566,29 +579,7 @@ setup_root: return dget(sb->s_root); } - if (new) { - const struct qstr name = { .name = "/", .len = 1 }; - - /* - * New inode, we need to make the dentry a sibling of s_root so - * everything gets cleaned up properly on unmount. - */ - dentry = d_alloc(sb->s_root, &name); - if (!dentry) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - d_splice_alias(inode, dentry); - } else { - /* - * We found the inode in cache, just find a dentry for it and - * put the reference to the inode we just got. - */ - dentry = d_find_alias(inode); - iput(inode); - } - - return dentry; + return d_obtain_alias(inode); } static int btrfs_fill_super(struct super_block *sb, @@ -719,6 +710,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noacl"); if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); + else + seq_puts(seq, ",nospace_cache"); if (btrfs_test_opt(root, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) @@ -753,6 +746,111 @@ static int btrfs_set_super(struct super_block *s, void *data) return set_anon_super(s, data); } +/* + * subvolumes are identified by ino 256 + */ +static inline int is_subvolume_inode(struct inode *inode) +{ + if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + return 0; +} + +/* + * This will strip out the subvol=%s argument for an argument string and add + * subvolid=0 to make sure we get the actual tree root for path walking to the + * subvol we want. + */ +static char *setup_root_args(char *args) +{ + unsigned copied = 0; + unsigned len = strlen(args) + 2; + char *pos; + char *ret; + + /* + * We need the same args as before, but minus + * + * subvol=a + * + * and add + * + * subvolid=0 + * + * which is a difference of 2 characters, so we allocate strlen(args) + + * 2 characters. + */ + ret = kzalloc(len * sizeof(char), GFP_NOFS); + if (!ret) + return NULL; + pos = strstr(args, "subvol="); + + /* This shouldn't happen, but just in case.. */ + if (!pos) { + kfree(ret); + return NULL; + } + + /* + * The subvol=<> arg is not at the front of the string, copy everybody + * up to that into ret. + */ + if (pos != args) { + *pos = '\0'; + strcpy(ret, args); + copied += strlen(args); + pos++; + } + + strncpy(ret + copied, "subvolid=0", len - copied); + + /* Length of subvolid=0 */ + copied += 10; + + /* + * If there is no , after the subvol= option then we know there's no + * other options and we can just return. + */ + pos = strchr(pos, ','); + if (!pos) + return ret; + + /* Copy the rest of the arguments into our buffer */ + strncpy(ret + copied, pos, len - copied); + copied += strlen(pos); + + return ret; +} + +static struct dentry *mount_subvol(const char *subvol_name, int flags, + const char *device_name, char *data) +{ + struct dentry *root; + struct vfsmount *mnt; + char *newargs; + + newargs = setup_root_args(data); + if (!newargs) + return ERR_PTR(-ENOMEM); + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, + newargs); + kfree(newargs); + if (IS_ERR(mnt)) + return ERR_CAST(mnt); + + root = mount_subtree(mnt, subvol_name); + + if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { + struct super_block *s = root->d_sb; + dput(root); + root = ERR_PTR(-EINVAL); + deactivate_locked_super(s); + printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", + subvol_name); + } + + return root; +} /* * Find a superblock for the given device / mount point. @@ -767,7 +865,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct super_block *s; struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; - struct btrfs_root *tree_root = NULL; struct btrfs_fs_info *fs_info = NULL; fmode_t mode = FMODE_READ; char *subvol_name = NULL; @@ -781,21 +878,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name, &subvol_objectid, &subvol_rootid, &fs_devices); - if (error) + if (error) { + kfree(subvol_name); return ERR_PTR(error); + } - error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); - if (error) - goto error_free_subvol_name; + if (subvol_name) { + root = mount_subvol(subvol_name, flags, device_name, data); + kfree(subvol_name); + return root; + } - error = btrfs_open_devices(fs_devices, mode, fs_type); + error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); if (error) - goto error_free_subvol_name; - - if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { - error = -EACCES; - goto error_close_devices; - } + return ERR_PTR(error); /* * Setup a dummy root and fs_info for test/set super. This is because @@ -804,19 +900,40 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, * then open_ctree will properly initialize everything later. */ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); - tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!fs_info || !tree_root) { + if (!fs_info) + return ERR_PTR(-ENOMEM); + + fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info->tree_root) { error = -ENOMEM; - goto error_close_devices; + goto error_fs_info; } - fs_info->tree_root = tree_root; + fs_info->tree_root->fs_info = fs_info; fs_info->fs_devices = fs_devices; - tree_root->fs_info = fs_info; + + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + error = -ENOMEM; + goto error_fs_info; + } + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_fs_info; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; + goto error_close_devices; + } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); - if (IS_ERR(s)) - goto error_s; + s = sget(fs_type, btrfs_test_super, btrfs_set_super, + fs_info->tree_root); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto error_close_devices; + } if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { @@ -826,75 +943,35 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } btrfs_close_devices(fs_devices); - kfree(fs_info); - kfree(tree_root); + free_fs_info(fs_info); } else { char b[BDEVNAME_SIZE]; s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + btrfs_sb(s)->fs_info->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); - goto error_free_subvol_name; + return ERR_PTR(error); } - btrfs_sb(s)->fs_info->bdev_holder = fs_type; s->s_flags |= MS_ACTIVE; } - /* if they gave us a subvolume name bind mount into that */ - if (strcmp(subvol_name, ".")) { - struct dentry *new_root; - - root = get_default_root(s, subvol_rootid); - if (IS_ERR(root)) { - error = PTR_ERR(root); - deactivate_locked_super(s); - goto error_free_subvol_name; - } - - mutex_lock(&root->d_inode->i_mutex); - new_root = lookup_one_len(subvol_name, root, - strlen(subvol_name)); - mutex_unlock(&root->d_inode->i_mutex); - - if (IS_ERR(new_root)) { - dput(root); - deactivate_locked_super(s); - error = PTR_ERR(new_root); - goto error_free_subvol_name; - } - if (!new_root->d_inode) { - dput(root); - dput(new_root); - deactivate_locked_super(s); - error = -ENXIO; - goto error_free_subvol_name; - } - dput(root); - root = new_root; - } else { - root = get_default_root(s, subvol_objectid); - if (IS_ERR(root)) { - error = PTR_ERR(root); - deactivate_locked_super(s); - goto error_free_subvol_name; - } + root = get_default_root(s, subvol_objectid); + if (IS_ERR(root)) { + deactivate_locked_super(s); + return root; } - kfree(subvol_name); return root; -error_s: - error = PTR_ERR(s); error_close_devices: btrfs_close_devices(fs_devices); - kfree(fs_info); - kfree(tree_root); -error_free_subvol_name: - kfree(subvol_name); +error_fs_info: + free_fs_info(fs_info); return ERR_PTR(error); } @@ -919,7 +996,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (root->fs_info->fs_devices->rw_devices == 0) return -EACCES; - if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + if (btrfs_super_log_root(root->fs_info->super_copy) != 0) return -EINVAL; ret = btrfs_cleanup_fs_roots(root->fs_info); @@ -980,7 +1057,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) int i = 0, nr_devices; int ret; - nr_devices = fs_info->fs_devices->rw_devices; + nr_devices = fs_info->fs_devices->open_devices; BUG_ON(!nr_devices); devices_info = kmalloc(sizeof(*devices_info) * nr_devices, @@ -1002,8 +1079,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) else min_stripe_size = BTRFS_STRIPE_LEN; - list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - if (!device->in_fs_metadata) + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->in_fs_metadata || !device->bdev) continue; avail_space = device->total_bytes - device->bytes_used; @@ -1085,7 +1162,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 51dcec86757f..81376d94cd3c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) struct btrfs_transaction *cur_trans; spin_lock(&root->fs_info->trans_lock); +loop: if (root->fs_info->trans_no_join) { if (!nofail) { spin_unlock(&root->fs_info->trans_lock); @@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); if (!cur_trans) return -ENOMEM; + spin_lock(&root->fs_info->trans_lock); if (root->fs_info->running_transaction) { + /* + * someone started a transaction after we unlocked. Make sure + * to redo the trans_no_join checks above + */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); cur_trans = root->fs_info->running_transaction; - atomic_inc(&cur_trans->use_count); - atomic_inc(&cur_trans->num_writers); - cur_trans->num_joined++; - spin_unlock(&root->fs_info->trans_lock); - return 0; + goto loop; } + atomic_set(&cur_trans->num_writers, 1); cur_trans->num_joined = 0; init_waitqueue_head(&cur_trans->writer_wait); @@ -216,17 +219,11 @@ static void wait_current_trans(struct btrfs_root *root) spin_lock(&root->fs_info->trans_lock); cur_trans = root->fs_info->running_transaction; if (cur_trans && cur_trans->blocked) { - DEFINE_WAIT(wait); atomic_inc(&cur_trans->use_count); spin_unlock(&root->fs_info->trans_lock); - while (1) { - prepare_to_wait(&root->fs_info->transaction_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (!cur_trans->blocked) - break; - schedule(); - } - finish_wait(&root->fs_info->transaction_wait, &wait); + + wait_event(root->fs_info->transaction_wait, + !cur_trans->blocked); put_transaction(cur_trans); } else { spin_unlock(&root->fs_info->trans_lock); @@ -260,7 +257,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; - int retries = 0; + u64 num_bytes = 0; int ret; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) @@ -274,6 +271,19 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->block_rsv = NULL; goto got_it; } + + /* + * Do the reservation before we join the transaction so we can do all + * the appropriate flushing if need be. + */ + if (num_items > 0 && root != root->fs_info->chunk_root) { + num_bytes = btrfs_calc_trans_metadata_size(root, num_items); + ret = btrfs_block_rsv_add(root, + &root->fs_info->trans_block_rsv, + num_bytes); + if (ret) + return ERR_PTR(ret); + } again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) @@ -310,24 +320,9 @@ again: goto again; } - if (num_items > 0) { - ret = btrfs_trans_reserve_metadata(h, root, num_items); - if (ret == -EAGAIN && !retries) { - retries++; - btrfs_commit_transaction(h, root); - goto again; - } else if (ret == -EAGAIN) { - /* - * We have already retried and got EAGAIN, so really we - * don't have space, so set ret to -ENOSPC. - */ - ret = -ENOSPC; - } - - if (ret < 0) { - btrfs_end_transaction(h, root); - return ERR_PTR(ret); - } + if (num_bytes) { + h->block_rsv = &root->fs_info->trans_block_rsv; + h->bytes_reserved = num_bytes; } got_it: @@ -359,19 +354,10 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root } /* wait for a transaction commit to be fully complete */ -static noinline int wait_for_commit(struct btrfs_root *root, +static noinline void wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) { - DEFINE_WAIT(wait); - while (!commit->commit_done) { - prepare_to_wait(&commit->commit_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (commit->commit_done) - break; - schedule(); - } - finish_wait(&commit->commit_wait, &wait); - return 0; + wait_event(commit->commit_wait, commit->commit_done); } int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) @@ -435,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - ret = btrfs_block_rsv_check(trans, root, - &root->fs_info->global_block_rsv, 0, 5); + + ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); return ret ? 1 : 0; } @@ -444,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_block_rsv *rsv = trans->block_rsv; int updates; smp_mb(); if (cur_trans->blocked || cur_trans->delayed_refs.flushing) return 1; + /* + * We need to do this in case we're deleting csums so the global block + * rsv get's used instead of the csum block rsv. + */ + trans->block_rsv = NULL; + updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) btrfs_run_delayed_refs(trans, root, updates); + trans->block_rsv = rsv; + return should_end_transaction(trans, root); } @@ -470,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; while (count < 4) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; @@ -490,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, count++; } - btrfs_trans_release_metadata(trans, root); - if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { trans->transaction->blocked = 1; @@ -499,10 +494,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } if (lock && cur_trans->blocked && !cur_trans->in_commit) { - if (throttle) + if (throttle) { + /* + * We may race with somebody else here so end up having + * to call end_transaction on ourselves again, so inc + * our use_count. + */ + trans->use_count++; return btrfs_commit_transaction(trans, root); - else + } else { wake_up_process(info->transaction_kthread); + } } WARN_ON(cur_trans != info->running_transaction); @@ -572,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { - int ret; int err = 0; int werr = 0; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; u64 start = 0; u64 end; - unsigned long index; - - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); - if (ret) - break; - while (start <= end) { - cond_resched(); - - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) - continue; - - btree_lock_page_hook(page); - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - continue; - } - if (PageWriteback(page)) { - if (PageDirty(page)) - wait_on_page_writeback(page); - else { - unlock_page(page); - page_cache_release(page); - continue; - } - } - err = write_one_page(page, 0); - if (err) - werr = err; - page_cache_release(page); - } + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + mark)) { + convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, + GFP_NOFS); + err = filemap_fdatawrite_range(mapping, start, end); + if (err) + werr = err; + cond_resched(); + start = end + 1; } if (err) werr = err; @@ -631,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root, int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { - int ret; int err = 0; int werr = 0; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; u64 start = 0; u64 end; - unsigned long index; - - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); - if (ret) - break; - clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); - while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) - continue; - if (PageDirty(page)) { - btree_lock_page_hook(page); - wait_on_page_writeback(page); - err = write_one_page(page, 0); - if (err) - werr = err; - } - wait_on_page_writeback(page); - page_cache_release(page); - cond_resched(); - } + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT)) { + clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); + err = filemap_fdatawait_range(mapping, start, end); + if (err) + werr = err; + cond_resched(); + start = end + 1; } if (err) werr = err; @@ -683,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, ret = btrfs_write_marked_extents(root, dirty_pages, mark); ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); - return ret || ret2; + + if (ret) + return ret; + if (ret2) + return ret2; + return 0; } int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, @@ -826,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_save_ino_cache(root, trans); + /* see comments in should_cow_block() */ + root->force_cow = 0; + smp_wmb(); + if (root->commit_root != root->node) { mutex_lock(&root->fs_commit_mutex); switch_commit_root(root); @@ -894,6 +857,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; + struct btrfs_block_rsv *rsv; struct inode *parent_inode; struct dentry *parent; struct dentry *dentry; @@ -905,6 +869,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 objectid; u64 root_flags; + rsv = trans->block_rsv; + new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { pending->error = -ENOMEM; @@ -918,11 +884,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); - btrfs_orphan_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, - to_reserve); + ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, + to_reserve); if (ret) { pending->error = ret; goto fail; @@ -986,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(old); free_extent_buffer(old); + /* see comments in should_cow_block() */ + root->force_cow = 1; + smp_wmb(); + btrfs_set_root_node(new_root_item, tmp); /* record when the snapshot was created in key.offset */ key.offset = trans->transid; @@ -1009,9 +978,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, BUG_ON(IS_ERR(pending->snap)); btrfs_reloc_post_snapshot(trans, pending); - btrfs_orphan_post_snapshot(trans, pending); fail: kfree(new_root_item); + trans->block_rsv = rsv; btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); return 0; } @@ -1038,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root) struct btrfs_root_item *root_item; struct btrfs_super_block *super; - super = &root->fs_info->super_copy; + super = root->fs_info->super_copy; root_item = &root->fs_info->chunk_root->root_item; super->chunk_root = root_item->bytenr; @@ -1049,7 +1018,7 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; - if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root, SPACE_CACHE)) super->cache_generation = root_item->generation; } @@ -1080,22 +1049,7 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) static void wait_current_trans_commit_start(struct btrfs_root *root, struct btrfs_transaction *trans) { - DEFINE_WAIT(wait); - - if (trans->in_commit) - return; - - while (1) { - prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (trans->in_commit) { - finish_wait(&root->fs_info->transaction_blocked_wait, - &wait); - break; - } - schedule(); - finish_wait(&root->fs_info->transaction_blocked_wait, &wait); - } + wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit); } /* @@ -1105,24 +1059,8 @@ static void wait_current_trans_commit_start(struct btrfs_root *root, static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, struct btrfs_transaction *trans) { - DEFINE_WAIT(wait); - - if (trans->commit_done || (trans->in_commit && !trans->blocked)) - return; - - while (1) { - prepare_to_wait(&root->fs_info->transaction_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (trans->commit_done || - (trans->in_commit && !trans->blocked)) { - finish_wait(&root->fs_info->transaction_wait, - &wait); - break; - } - schedule(); - finish_wait(&root->fs_info->transaction_wait, - &wait); - } + wait_event(root->fs_info->transaction_wait, + trans->commit_done || (trans->in_commit && !trans->blocked)); } /* @@ -1205,14 +1143,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_run_ordered_operations(root, 0); + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); - btrfs_trans_release_metadata(trans, root); - cur_trans = trans->transaction; /* * set the flushing flag so procs in this transaction have to @@ -1229,8 +1168,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, atomic_inc(&cur_trans->use_count); btrfs_end_transaction(trans, root); - ret = wait_for_commit(root, cur_trans); - BUG_ON(ret); + wait_for_commit(root, cur_trans); put_transaction(cur_trans); @@ -1379,12 +1317,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, update_super_roots(root); if (!root->fs_info->log_root_recovering) { - btrfs_set_super_log_root(&root->fs_info->super_copy, 0); - btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root(root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); } - memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, - sizeof(root->fs_info->super_copy)); + memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, + sizeof(*root->fs_info->super_copy)); trans->transaction->blocked = 0; spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4ce8a9f41d1e..3568374d419d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log, struct walk_control *wc, u64 gen) { if (wc->pin) - btrfs_pin_extent(log->fs_info->extent_root, - eb->start, eb->len, 0); + btrfs_pin_extent_for_log_replay(wc->trans, + log->fs_info->extent_root, + eb->start, eb->len); if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) @@ -799,14 +800,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir; - int ret; struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *dir; struct inode *inode; - char *name; - int namelen; unsigned long ref_ptr; unsigned long ref_end; + char *name; + int namelen; + int ret; int search_done = 0; /* @@ -909,6 +911,25 @@ again: } btrfs_release_path(path); + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), + btrfs_inode_ref_index(eb, ref), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(path); + + /* look for a conflicing name */ + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(path); + insert: /* insert our name */ ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, @@ -1010,7 +1031,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, } btrfs_release_path(path); if (nlink != inode->i_nlink) { - inode->i_nlink = nlink; + set_nlink(inode, nlink); btrfs_update_inode(trans, root, inode); } BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1617,7 +1638,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, return 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; nritems = btrfs_header_nritems(eb); for (i = 0; i < nritems; i++) { @@ -1723,21 +1745,23 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return -ENOMEM; if (*level == 1) { - wc->process_func(root, next, wc, ptr_gen); + ret = wc->process_func(root, next, wc, ptr_gen); + if (ret) + return ret; path->slots[*level]++; if (wc->free) { btrfs_read_buffer(next, ptr_gen); btrfs_tree_lock(next); - clean_tree_block(trans, root, next); btrfs_set_lock_blocking(next); + clean_tree_block(trans, root, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, bytenr, blocksize); BUG_ON(ret); } @@ -1788,21 +1812,24 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level + 1]; root_owner = btrfs_header_owner(parent); - wc->process_func(root, path->nodes[*level], wc, + ret = wc->process_func(root, path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level])); + if (ret) + return ret; + if (wc->free) { struct extent_buffer *next; next = path->nodes[*level]; btrfs_tree_lock(next); - clean_tree_block(trans, root, next); btrfs_set_lock_blocking(next); + clean_tree_block(trans, root, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, path->nodes[*level]->start, path->nodes[*level]->len); BUG_ON(ret); @@ -1864,14 +1891,14 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[orig_level]; btrfs_tree_lock(next); - clean_tree_block(trans, log, next); btrfs_set_lock_blocking(next); + clean_tree_block(trans, log, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(log, next->start, + ret = btrfs_free_and_pin_reserved_extent(log, next->start, next->len); BUG_ON(ret); } @@ -1987,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); - while (1) { unsigned long batch = root->log_batch; - if (root->log_multiple_pids) { + /* when we're on an ssd, just kick the log commit out */ + if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); @@ -2091,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, BUG_ON(ret); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - btrfs_set_super_log_root(&root->fs_info->super_for_commit, + btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); - btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_set_super_log_root_level(root->fs_info->super_for_commit, btrfs_header_level(log_root_tree->node)); log_root_tree->log_batch = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 19450bc53632..c37433d3cd82 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -142,6 +142,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) unsigned long limit; unsigned long last_waited = 0; int force_reg = 0; + int sync_pending = 0; struct blk_plug plug; /* @@ -229,6 +230,22 @@ loop_lock: BUG_ON(atomic_read(&cur->bi_cnt) == 0); + /* + * if we're doing the sync list, record that our + * plug has some sync requests on it + * + * If we're doing the regular list and there are + * sync requests sitting around, unplug before + * we add more + */ + if (pending_bios == &device->pending_sync_bios) { + sync_pending = 1; + } else if (sync_pending) { + blk_finish_plug(&plug); + blk_start_plug(&plug); + sync_pending = 0; + } + submit_bio(cur->bi_rw, cur); num_run++; batch_run++; @@ -349,6 +366,14 @@ static noinline int device_list_add(const char *path, } INIT_LIST_HEAD(&device->dev_alloc_list); + /* init readahead state */ + spin_lock_init(&device->reada_lock); + device->reada_curr_zone = NULL; + atomic_set(&device->reada_in_flight, 0); + device->reada_next = 0; + INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); + mutex_lock(&fs_devices->device_list_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); mutex_unlock(&fs_devices->device_list_mutex); @@ -500,6 +525,9 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) fs_devices->rw_devices--; } + if (device->can_discard) + fs_devices->num_can_discard--; + new_device = kmalloc(sizeof(*new_device), GFP_NOFS); BUG_ON(!new_device); memcpy(new_device, device, sizeof(*new_device)); @@ -508,6 +536,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) new_device->bdev = NULL; new_device->writeable = 0; new_device->in_fs_metadata = 0; + new_device->can_discard = 0; list_replace_rcu(&device->dev_list, &new_device->dev_list); call_rcu(&device->rcu, free_device); @@ -547,6 +576,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { + struct request_queue *q; struct block_device *bdev; struct list_head *head = &fs_devices->devices; struct btrfs_device *device; @@ -575,10 +605,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; + if (!bh) goto error_close; - } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -603,6 +631,12 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, seeding = 0; } + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) { + device->can_discard = 1; + fs_devices->num_can_discard++; + } + device->bdev = bdev; device->in_fs_metadata = 0; device->mode = flags; @@ -627,7 +661,7 @@ error: continue; } if (fs_devices->open_devices == 0) { - ret = -EIO; + ret = -EINVAL; goto out; } fs_devices->seeding = seeding; @@ -835,6 +869,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, max_hole_start = search_start; max_hole_size = 0; + hole_size = 0; if (search_start >= search_end) { ret = -ENOSPC; @@ -917,7 +952,14 @@ next: cond_resched(); } - hole_size = search_end- search_start; + /* + * At this point, search_start should be the end of + * allocated dev extents, and when shrinking the device, + * search_end may be smaller than search_start. + */ + if (search_end > search_start) + hole_size = search_end - search_start; + if (hole_size > max_hole_size) { max_hole_start = search_start; max_hole_size = hole_size; @@ -957,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, key.objectid = device->devid; key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; - +again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, @@ -970,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_dev_extent); BUG_ON(found_key.offset > start || found_key.offset + btrfs_dev_extent_length(leaf, extent) < start); + key = found_key; + btrfs_release_path(path); + goto again; } else if (ret == 0) { leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], @@ -977,8 +1022,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, } BUG_ON(ret); - if (device->bytes_used > 0) - device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + if (device->bytes_used > 0) { + u64 len = btrfs_dev_extent_length(leaf, extent); + device->bytes_used -= len; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += len; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = btrfs_del_item(trans, root, path); out: @@ -1037,7 +1087,8 @@ static noinline int find_next_chunk(struct btrfs_root *root, struct btrfs_key found_key; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; key.objectid = objectid; key.offset = (u64)-1; @@ -1319,6 +1370,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto error_undo; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space = device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + device->in_fs_metadata = 0; btrfs_scrub_cancel_dev(root, device); @@ -1350,8 +1406,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) call_rcu(&device->rcu, free_device); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; - btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; @@ -1413,7 +1469,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct btrfs_device *device; u64 super_flags; @@ -1542,6 +1598,7 @@ error: int btrfs_init_new_device(struct btrfs_root *root, char *device_path) { + struct request_queue *q; struct btrfs_trans_handle *trans; struct btrfs_device *device; struct block_device *bdev; @@ -1611,6 +1668,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) lock_chunks(root); + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; device->writeable = 1; device->work.func = pending_bios_fn; generate_random_uuid(device->uuid); @@ -1646,17 +1706,23 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->num_devices++; root->fs_info->fs_devices->open_devices++; root->fs_info->fs_devices->rw_devices++; + if (device->can_discard) + root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes; + spin_unlock(&root->fs_info->free_chunk_lock); + if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; - total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); - btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + btrfs_set_super_total_bytes(root->fs_info->super_copy, total_bytes + device->total_bytes); - total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); - btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); + btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); @@ -1747,7 +1813,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { struct btrfs_super_block *super_copy = - &device->dev_root->fs_info->super_copy; + device->dev_root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 diff = new_size - device->total_bytes; @@ -1806,7 +1872,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 chunk_offset) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; u8 *ptr; @@ -2061,8 +2127,10 @@ int btrfs_balance(struct btrfs_root *dev_root) /* step two, relocate all the chunks */ path = btrfs_alloc_path(); - BUG_ON(!path); - + if (!path) { + ret = -ENOMEM; + goto error; + } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; @@ -2130,7 +2198,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) bool retried = false; struct extent_buffer *l; struct btrfs_key key; - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; @@ -2147,8 +2215,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) lock_chunks(root); device->total_bytes = new_size; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space -= diff; + spin_unlock(&root->fs_info->free_chunk_lock); + } unlock_chunks(root); again: @@ -2212,6 +2284,9 @@ again: device->total_bytes = old_size; if (device->writeable) device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); unlock_chunks(root); goto done; } @@ -2247,7 +2322,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key disk_key; u32 array_size; u8 *ptr; @@ -2410,9 +2485,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, total_avail = device->total_bytes - device->bytes_used; else total_avail = 0; - /* avail is off by max(alloc_start, 1MB), but that is the same - * for all devices, so it doesn't hurt the sorting later on - */ + + /* If there is no space on this device, skip it. */ + if (total_avail == 0) + continue; ret = find_free_dev_extent(trans, device, max_stripe_size * dev_stripes, @@ -2569,6 +2645,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, index++; } + spin_lock(&extent_root->fs_info->free_chunk_lock); + extent_root->fs_info->free_chunk_space -= (stripe_size * + map->num_stripes); + spin_unlock(&extent_root->fs_info->free_chunk_lock); + index = 0; stripe = &chunk->stripe; while (index < map->num_stripes) { @@ -2661,7 +2742,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, ret = find_next_chunk(fs_info->chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); - BUG_ON(ret); + if (ret) + return ret; alloc_profile = BTRFS_BLOCK_GROUP_METADATA | (fs_info->metadata_alloc_profile & @@ -2801,7 +2883,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num, static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, + struct btrfs_bio **bbio_ret, int mirror_num) { struct extent_map *em; @@ -2819,18 +2901,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int i; int num_stripes; int max_errors = 0; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; - if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) + if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) stripes_allocated = 1; again: - if (multi_ret) { - multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + if (bbio_ret) { + bbio = kzalloc(btrfs_bio_size(stripes_allocated), GFP_NOFS); - if (!multi) + if (!bbio) return -ENOMEM; - atomic_set(&multi->error, 0); + atomic_set(&bbio->error, 0); } read_lock(&em_tree->lock); @@ -2851,7 +2933,7 @@ again: if (mirror_num > map->num_stripes) mirror_num = 0; - /* if our multi bio struct is too small, back off and try again */ + /* if our btrfs_bio struct is too small, back off and try again */ if (rw & REQ_WRITE) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) { @@ -2870,11 +2952,11 @@ again: stripes_required = map->num_stripes; } } - if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && + if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); - kfree(multi); + kfree(bbio); goto again; } stripe_nr = offset; @@ -2903,7 +2985,7 @@ again: *length = em->len - offset; } - if (!multi_ret) + if (!bbio_ret) goto out; num_stripes = 1; @@ -2928,13 +3010,17 @@ again: stripe_index = find_live_mirror(map, 0, map->num_stripes, current->pid % map->num_stripes); + mirror_num = stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD)) + if (rw & (REQ_WRITE | REQ_DISCARD)) { num_stripes = map->num_stripes; - else if (mirror_num) + } else if (mirror_num) { stripe_index = mirror_num - 1; + } else { + mirror_num = 1; + } } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; @@ -2954,6 +3040,7 @@ again: stripe_index = find_live_mirror(map, stripe_index, map->sub_stripes, stripe_index + current->pid % map->sub_stripes); + mirror_num = stripe_index + 1; } } else { /* @@ -2962,15 +3049,16 @@ again: * stripe_index is the number of our device in the stripe array */ stripe_index = do_div(stripe_nr, map->num_stripes); + mirror_num = stripe_index + 1; } BUG_ON(stripe_index >= map->num_stripes); if (rw & REQ_DISCARD) { for (i = 0; i < num_stripes; i++) { - multi->stripes[i].physical = + bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = map->stripes[stripe_index].dev; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { u64 stripes; @@ -2991,16 +3079,16 @@ again: } stripes = stripe_nr_end - 1 - j; do_div(stripes, map->num_stripes); - multi->stripes[i].length = map->stripe_len * + bbio->stripes[i].length = map->stripe_len * (stripes - stripe_nr + 1); if (i == 0) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_offset; stripe_offset = 0; } if (stripe_index == last_stripe) - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_end_offset; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { u64 stripes; @@ -3025,11 +3113,11 @@ again: } stripes = stripe_nr_end - 1 - j; do_div(stripes, factor); - multi->stripes[i].length = map->stripe_len * + bbio->stripes[i].length = map->stripe_len * (stripes - stripe_nr + 1); if (i < map->sub_stripes) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_offset; if (i == map->sub_stripes - 1) stripe_offset = 0; @@ -3037,11 +3125,11 @@ again: if (stripe_index >= last_stripe && stripe_index <= (last_stripe + map->sub_stripes - 1)) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_end_offset; } } else - multi->stripes[i].length = *length; + bbio->stripes[i].length = *length; stripe_index++; if (stripe_index == map->num_stripes) { @@ -3052,19 +3140,20 @@ again: } } else { for (i = 0; i < num_stripes; i++) { - multi->stripes[i].physical = + bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = + bbio->stripes[i].dev = map->stripes[stripe_index].dev; stripe_index++; } } - if (multi_ret) { - *multi_ret = multi; - multi->num_stripes = num_stripes; - multi->max_errors = max_errors; + if (bbio_ret) { + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; } out: free_extent_map(em); @@ -3073,9 +3162,9 @@ out: int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num) + struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, mirror_num); } @@ -3144,28 +3233,30 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static void end_bio_multi_stripe(struct bio *bio, int err) +static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_multi_bio *multi = bio->bi_private; + struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; if (err) - atomic_inc(&multi->error); + atomic_inc(&bbio->error); - if (bio == multi->orig_bio) + if (bio == bbio->orig_bio) is_orig_bio = 1; - if (atomic_dec_and_test(&multi->stripes_pending)) { + if (atomic_dec_and_test(&bbio->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); - bio = multi->orig_bio; + bio = bbio->orig_bio; } - bio->bi_private = multi->private; - bio->bi_end_io = multi->end_io; + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + bio->bi_bdev = (struct block_device *) + (unsigned long)bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the multi-bio */ - if (atomic_read(&multi->error) > multi->max_errors) { + if (atomic_read(&bbio->error) > bbio->max_errors) { err = -EIO; } else if (err) { /* @@ -3175,7 +3266,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err) set_bit(BIO_UPTODATE, &bio->bi_flags); err = 0; } - kfree(multi); + kfree(bbio); bio_endio(bio, err); } else if (!is_orig_bio) { @@ -3255,20 +3346,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = (u64)bio->bi_sector << 9; u64 length = 0; u64 map_length; - struct btrfs_multi_bio *multi = NULL; int ret; int dev_nr = 0; int total_devs = 1; + struct btrfs_bio *bbio = NULL; length = bio->bi_size; map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, mirror_num); BUG_ON(ret); - total_devs = multi->num_stripes; + total_devs = bbio->num_stripes; if (map_length < length) { printk(KERN_CRIT "mapping failed logical %llu bio len %llu " "len %llu\n", (unsigned long long)logical, @@ -3276,25 +3367,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, (unsigned long long)map_length); BUG(); } - multi->end_io = first_bio->bi_end_io; - multi->private = first_bio->bi_private; - multi->orig_bio = first_bio; - atomic_set(&multi->stripes_pending, multi->num_stripes); + + bbio->orig_bio = first_bio; + bbio->private = first_bio->bi_private; + bbio->end_io = first_bio->bi_end_io; + atomic_set(&bbio->stripes_pending, bbio->num_stripes); while (dev_nr < total_devs) { - if (total_devs > 1) { - if (dev_nr < total_devs - 1) { - bio = bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); - } else { - bio = first_bio; - } - bio->bi_private = multi; - bio->bi_end_io = end_bio_multi_stripe; + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; } - bio->bi_sector = multi->stripes[dev_nr].physical >> 9; - dev = multi->stripes[dev_nr].dev; + bio->bi_private = bbio; + bio->bi_end_io = btrfs_end_bio; + bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; + dev = bbio->stripes[dev_nr].dev; if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { + pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " + "(%s id %llu), size=%u\n", rw, + (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, + dev->name, dev->devid, bio->bi_size); bio->bi_bdev = dev->bdev; if (async_submit) schedule_bio(root, dev, rw, bio); @@ -3307,8 +3401,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } dev_nr++; } - if (total_devs == 1) - kfree(multi); return 0; } @@ -3569,15 +3661,20 @@ static int read_one_dev(struct btrfs_root *root, fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = 0; return ret; } int btrfs_read_sys_array(struct btrfs_root *root) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; @@ -3595,7 +3692,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); - btrfs_set_buffer_lockdep_class(sb, 0); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7c12d61ae7ae..78f2d4d4f37f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -48,6 +48,7 @@ struct btrfs_device { int writeable; int in_fs_metadata; int missing; + int can_discard; spinlock_t io_lock; @@ -91,6 +92,20 @@ struct btrfs_device { struct btrfs_work work; struct rcu_head rcu; struct work_struct rcu_work; + + /* readahead state */ + spinlock_t reada_lock; + atomic_t reada_in_flight; + u64 reada_next; + struct reada_zone *reada_curr_zone; + struct radix_tree_root reada_zones; + struct radix_tree_root reada_extents; + + /* for sending down flush barriers */ + struct bio *flush_bio; + struct completion flush_wait; + int nobarriers; + }; struct btrfs_fs_devices { @@ -104,6 +119,7 @@ struct btrfs_fs_devices { u64 rw_devices; u64 missing_devices; u64 total_rw_bytes; + u64 num_can_discard; struct block_device *latest_bdev; /* all of the devices in the FS, protected by a mutex @@ -134,7 +150,10 @@ struct btrfs_bio_stripe { u64 length; /* only used for discard mappings */ }; -struct btrfs_multi_bio { +struct btrfs_bio; +typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); + +struct btrfs_bio { atomic_t stripes_pending; bio_end_io_t *end_io; struct bio *orig_bio; @@ -142,6 +161,7 @@ struct btrfs_multi_bio { atomic_t error; int max_errors; int num_stripes; + int mirror_num; struct btrfs_bio_stripe stripes[]; }; @@ -169,7 +189,7 @@ struct map_lookup { int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); -#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ +#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, @@ -178,7 +198,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 start, u64 num_bytes); int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num); + struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 5366fe452ab0..3848b04e310e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -102,48 +102,82 @@ static int do_setxattr(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - /* first lets see if we already have this xattr */ - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, - strlen(name), -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } - - /* ok we already have this xattr, lets remove it */ - if (di) { - /* if we want create only exit */ - if (flags & XATTR_CREATE) { - ret = -EEXIST; + if (flags & XATTR_REPLACE) { + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, + name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (!di) { + ret = -ENODATA; goto out; } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); + if (ret) + goto out; btrfs_release_path(path); - /* if we don't have a value then we are removing the xattr */ + /* + * remove the attribute + */ if (!value) goto out; - } else { + } + +again: + ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), + name, name_len, value, size); + /* + * If we're setting an xattr to a new value but the new value is say + * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting + * back from split_leaf. This is because it thinks we'll be extending + * the existing item size, but we're asking for enough space to add the + * item itself. So if we get EOVERFLOW just set ret to EEXIST and let + * the rest of the function figure it out. + */ + if (ret == -EOVERFLOW) + ret = -EEXIST; + + if (ret == -EEXIST) { + if (flags & XATTR_CREATE) + goto out; + /* + * We can't use the path we already have since we won't have the + * proper locking for a delete, so release the path and + * re-lookup to delete the thing. + */ btrfs_release_path(path); + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (!di) { + /* Shouldn't happen but just in case... */ + btrfs_release_path(path); + goto again; + } - if (flags & XATTR_REPLACE) { - /* we couldn't find the attr to replace */ - ret = -ENODATA; + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) goto out; + + /* + * We have a value to set, so go back and try to insert it now. + */ + if (value) { + btrfs_release_path(path); + goto again; } } - - /* ok we have to create a completely new xattr */ - ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), - name, name_len, value, size); - BUG_ON(ret); out: btrfs_free_path(path); return ret; } +/* + * @value: "" makes the attribute to empty, NULL removes it + */ int __btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -360,36 +394,36 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) XATTR_REPLACE); } -int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *suffix; + const struct xattr *xattr; + struct btrfs_trans_handle *trans = fs_info; char *name; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &suffix, &value, - &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1, - GFP_NOFS); - if (!name) { - err = -ENOMEM; - } else { + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } strcpy(name, XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); - err = __btrfs_setxattr(trans, inode, name, value, len, 0); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + err = __btrfs_setxattr(trans, inode, name, + xattr->value, xattr->value_len, 0); kfree(name); + if (err < 0) + break; } - - kfree(suffix); - kfree(value); return err; } + +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &btrfs_initxattrs, trans); +} diff --git a/fs/buffer.c b/fs/buffer.c index 1a80b048ade8..19d8eb7fdc81 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -213,13 +213,16 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) * elsewhere, don't buffer_error if we had some unmapped buffers */ if (all_mapped) { + char b[BDEVNAME_SIZE]; + printk("__find_get_block_slow() failed. " "block=%llu, b_blocknr=%llu\n", (unsigned long long)block, (unsigned long long)bh->b_blocknr); printk("b_state=0x%08lx, b_size=%zu\n", bh->b_state, bh->b_size); - printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); + printk("device %s blocksize: %d\n", bdevname(bdev, b), + 1 << bd_inode->i_blkbits); } out_unlock: spin_unlock(&bd_mapping->private_lock); @@ -285,7 +288,7 @@ static void free_more_memory(void) struct zone *zone; int nid; - wakeup_flusher_threads(1024); + wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); yield(); for_each_online_node(nid) { @@ -1470,13 +1473,13 @@ static void discard_buffer(struct buffer_head * bh) } /** - * block_invalidatepage - invalidate part of all of a buffer-backed page + * block_invalidatepage - invalidate part or all of a buffer-backed page * * @page: the page which is affected * @offset: the index of the truncation point * * block_invalidatepage() is called when all or part of the page has become - * invalidatedby a truncate operation. + * invalidated by a truncate operation. * * block_invalidatepage() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5a3953db8118..4144caf2f9d3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -228,102 +228,155 @@ static int ceph_readpage(struct file *filp, struct page *page) } /* - * Build a vector of contiguous pages from the provided page list. + * Finish an async read(ahead) op. */ -static struct page **page_vector_from_list(struct list_head *page_list, - unsigned *nr_pages) +static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { - struct page **pages; - struct page *page; - int next_index, contig_pages = 0; + struct inode *inode = req->r_inode; + struct ceph_osd_reply_head *replyhead; + int rc, bytes; + int i; - /* build page vector */ - pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); - if (!pages) - return ERR_PTR(-ENOMEM); + /* parse reply */ + replyhead = msg->front.iov_base; + WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); + rc = le32_to_cpu(replyhead->result); + bytes = le32_to_cpu(msg->hdr.data_len); - BUG_ON(list_empty(page_list)); - next_index = list_entry(page_list->prev, struct page, lru)->index; - list_for_each_entry_reverse(page, page_list, lru) { - if (page->index == next_index) { - dout("readpages page %d %p\n", contig_pages, page); - pages[contig_pages] = page; - contig_pages++; - next_index++; - } else { - break; + dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); + + /* unlock all pages, zeroing any data we didn't read */ + for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { + struct page *page = req->r_pages[i]; + + if (bytes < (int)PAGE_CACHE_SIZE) { + /* zero (remainder of) page */ + int s = bytes < 0 ? 0 : bytes; + zero_user_segment(page, s, PAGE_CACHE_SIZE); } + dout("finish_read %p uptodate %p idx %lu\n", inode, page, + page->index); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); } - *nr_pages = contig_pages; - return pages; + kfree(req->r_pages); } /* - * Read multiple pages. Leave pages we don't read + unlock in page_list; - * the caller (VM) cleans them up. + * start an async read(ahead) operation. return nr_pages we submitted + * a read for on success, or negative error code. */ -static int ceph_readpages(struct file *file, struct address_space *mapping, - struct list_head *page_list, unsigned nr_pages) +static int start_read(struct inode *inode, struct list_head *page_list, int max) { - struct inode *inode = file->f_dentry->d_inode; - struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; - int rc = 0; - struct page **pages; - loff_t offset; + struct ceph_inode_info *ci = ceph_inode(inode); + struct page *page = list_entry(page_list->prev, struct page, lru); + struct ceph_osd_request *req; + u64 off; u64 len; + int i; + struct page **pages; + pgoff_t next_index; + int nr_pages = 0; + int ret; - dout("readpages %p file %p nr_pages %d\n", - inode, file, nr_pages); - - pages = page_vector_from_list(page_list, &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); + off = page->index << PAGE_CACHE_SHIFT; - /* guess read extent */ - offset = pages[0]->index << PAGE_CACHE_SHIFT; + /* count pages */ + next_index = page->index; + list_for_each_entry_reverse(page, page_list, lru) { + if (page->index != next_index) + break; + nr_pages++; + next_index++; + if (max && nr_pages == max) + break; + } len = nr_pages << PAGE_CACHE_SHIFT; - rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - offset, &len, - ci->i_truncate_seq, ci->i_truncate_size, - pages, nr_pages, 0); - if (rc == -ENOENT) - rc = 0; - if (rc < 0) - goto out; - - for (; !list_empty(page_list) && len > 0; - rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { - struct page *page = - list_entry(page_list->prev, struct page, lru); + dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, + off, len); + + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), + off, &len, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, 0, + ci->i_truncate_seq, ci->i_truncate_size, + NULL, false, 1, 0); + if (!req) + return -ENOMEM; + /* build page vector */ + nr_pages = len >> PAGE_CACHE_SHIFT; + pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); + ret = -ENOMEM; + if (!pages) + goto out; + for (i = 0; i < nr_pages; ++i) { + page = list_entry(page_list->prev, struct page, lru); + BUG_ON(PageLocked(page)); list_del(&page->lru); - - if (rc < (int)PAGE_CACHE_SIZE) { - /* zero (remainder of) page */ - int s = rc < 0 ? 0 : rc; - zero_user_segment(page, s, PAGE_CACHE_SIZE); - } - - if (add_to_page_cache_lru(page, mapping, page->index, + + dout("start_read %p adding %p idx %lu\n", inode, page, + page->index); + if (add_to_page_cache_lru(page, &inode->i_data, page->index, GFP_NOFS)) { page_cache_release(page); - dout("readpages %p add_to_page_cache failed %p\n", + dout("start_read %p add_to_page_cache failed %p\n", inode, page); - continue; + nr_pages = i; + goto out_pages; } - dout("readpages %p adding %p idx %lu\n", inode, page, - page->index); - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - page_cache_release(page); + pages[i] = page; } - rc = 0; + req->r_pages = pages; + req->r_num_pages = nr_pages; + req->r_callback = finish_read; + req->r_inode = inode; + + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); + ret = ceph_osdc_start_request(osdc, req, false); + if (ret < 0) + goto out_pages; + ceph_osdc_put_request(req); + return nr_pages; +out_pages: + ceph_release_page_vector(pages, nr_pages); +out: + ceph_osdc_put_request(req); + return ret; +} + + +/* + * Read multiple pages. Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. + */ +static int ceph_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned nr_pages) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + int rc = 0; + int max = 0; + + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + >> PAGE_SHIFT; + + dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, + max); + while (!list_empty(page_list)) { + rc = start_read(inode, page_list, max); + if (rc < 0) + goto out; + BUG_ON(rc == 0); + } out: - kfree(pages); + dout("readpages %p file %p ret %d\n", inode, file, rc); return rc; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8d74ad7ba556..0f327c6c9679 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -487,17 +487,15 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, ci->i_rdcache_gen++; /* - * if we are newly issued FILE_SHARED, clear I_COMPLETE; we + * if we are newly issued FILE_SHARED, clear D_COMPLETE; we * don't know what happened to this directory while we didn't * have the cap. */ if ((issued & CEPH_CAP_FILE_SHARED) && (had & CEPH_CAP_FILE_SHARED) == 0) { ci->i_shared_gen++; - if (S_ISDIR(ci->vfs_inode.i_mode)) { - dout(" marking %p NOT complete\n", &ci->vfs_inode); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - } + if (S_ISDIR(ci->vfs_inode.i_mode)) + ceph_dir_clear_complete(&ci->vfs_inode); } } @@ -945,7 +943,7 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); if (!msg) return -ENOMEM; @@ -2363,7 +2361,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } if ((issued & CEPH_CAP_LINK_EXCL) == 0) - inode->i_nlink = le32_to_cpu(grant->nlink); + set_nlink(inode, le32_to_cpu(grant->nlink)); if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 0dba6915712b..fb962efdacee 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -102,7 +102,7 @@ static int mdsc_show(struct seq_file *s, void *p) path = NULL; spin_lock(&req->r_old_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", - ceph_ino(req->r_old_dentry->d_parent->d_inode), + ceph_ino(req->r_old_dentry_dir), req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, path ? path : ""); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 1065ac779840..bca3948e9dbf 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -40,14 +40,6 @@ int ceph_init_dentry(struct dentry *dentry) if (dentry->d_fsdata) return 0; - if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ - ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) - d_set_d_op(dentry, &ceph_dentry_ops); - else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) - d_set_d_op(dentry, &ceph_snapdir_dentry_ops); - else - d_set_d_op(dentry, &ceph_snap_dentry_ops); - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); if (!di) return -ENOMEM; /* oh well */ @@ -58,16 +50,42 @@ int ceph_init_dentry(struct dentry *dentry) kmem_cache_free(ceph_dentry_cachep, di); goto out_unlock; } + + if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ + ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + d_set_d_op(dentry, &ceph_dentry_ops); + else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) + d_set_d_op(dentry, &ceph_snapdir_dentry_ops); + else + d_set_d_op(dentry, &ceph_snap_dentry_ops); + di->dentry = dentry; di->lease_session = NULL; - dentry->d_fsdata = di; dentry->d_time = jiffies; + /* avoid reordering d_fsdata setup so that the check above is safe */ + smp_mb(); + dentry->d_fsdata = di; ceph_dentry_lru_add(dentry); out_unlock: spin_unlock(&dentry->d_lock); return 0; } +struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) +{ + struct inode *inode = NULL; + + if (!dentry) + return NULL; + + spin_lock(&dentry->d_lock); + if (dentry->d_parent) { + inode = dentry->d_parent->d_inode; + ihold(inode); + } + spin_unlock(&dentry->d_lock); + return inode; +} /* @@ -90,7 +108,7 @@ static unsigned fpos_off(loff_t p) * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * - * I_COMPLETE tells indicates we have all dentries in the dir. It is + * D_COMPLETE tells indicates we have all dentries in the dir. It is * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). */ @@ -133,7 +151,7 @@ more: d_unhashed(dentry) ? "!hashed" : "hashed", parent->d_subdirs.prev, parent->d_subdirs.next); if (p == &parent->d_subdirs) { - fi->at_end = 1; + fi->flags |= CEPH_F_ATEND; goto out_unlock; } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); @@ -181,8 +199,8 @@ more: filp->f_pos++; /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { - dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); + if (!ceph_dir_test_complete(dir)) { + dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); err = -EAGAIN; goto out; } @@ -234,7 +252,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) const int max_bytes = fsc->mount_options->max_readdir_bytes; dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); - if (fi->at_end) + if (fi->flags & CEPH_F_ATEND) return 0; /* always start with . and .. */ @@ -267,7 +285,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) if ((filp->f_pos == 2 || fi->dentry) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && - (ci->i_ceph_flags & CEPH_I_COMPLETE) && + ceph_dir_test_complete(inode) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { spin_unlock(&inode->i_lock); err = __dcache_readdir(filp, dirent, filldir); @@ -333,7 +351,7 @@ more: if (!req->r_did_prepopulate) { dout("readdir !did_prepopulate"); - fi->dir_release_count--; /* preclude I_COMPLETE */ + fi->dir_release_count--; /* preclude D_COMPLETE */ } /* note next offset and last dentry name */ @@ -403,7 +421,7 @@ more: dout("readdir next frag is %x\n", frag); goto more; } - fi->at_end = 1; + fi->flags |= CEPH_F_ATEND; /* * if dir_release_count still matches the dir, no dentries @@ -412,8 +430,7 @@ more: */ spin_lock(&inode->i_lock); if (ci->i_release_count == fi->dir_release_count) { - dout(" marking %p complete\n", inode); - /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ceph_dir_set_complete(inode); ci->i_max_offset = filp->f_pos; } spin_unlock(&inode->i_lock); @@ -435,7 +452,7 @@ static void reset_readdir(struct ceph_file_info *fi) dput(fi->dentry); fi->dentry = NULL; } - fi->at_end = 0; + fi->flags &= ~CEPH_F_ATEND; } static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) @@ -463,7 +480,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; - fi->at_end = 0; + fi->flags &= ~CEPH_F_ATEND; } retval = offset; @@ -488,21 +505,13 @@ out: } /* - * Process result of a lookup/open request. - * - * Mainly, make sure we return the final req->r_dentry (if it already - * existed) in place of the original VFS-provided dentry when they - * differ. - * - * Gracefully handle the case where the MDS replies with -ENOENT and - * no trace (which it may do, at its discretion, e.g., if it doesn't - * care to issue a lease on the negative dentry). + * Handle lookups for the hidden .snap directory. */ -struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, - struct dentry *dentry, int err) +int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *parent = dentry->d_parent->d_inode; + struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ /* .snap dir? */ if (err == -ENOENT && @@ -516,7 +525,23 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, d_add(dentry, inode); err = 0; } + return err; +} +/* + * Figure out final result of a lookup/open request. + * + * Mainly, make sure we return the final req->r_dentry (if it already + * existed) in place of the original VFS-provided dentry when they + * differ. + * + * Gracefully handle the case where the MDS replies with -ENOENT and + * no trace (which it may do, at its discretion, e.g., if it doesn't + * care to issue a lease on the negative dentry). + */ +struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err) +{ if (err == -ENOENT) { /* no trace? */ err = 0; @@ -588,7 +613,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && - (ci->i_ceph_flags & CEPH_I_COMPLETE) && + ceph_dir_test_complete(dir) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { spin_unlock(&dir->i_lock); dout(" dir %p complete, -ENOENT\n", dir); @@ -610,6 +635,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_locked_dir = dir; err = ceph_mdsc_do_request(mdsc, NULL, req); + err = ceph_handle_snapdir(req, dentry, err); dentry = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); /* will dput(dentry) */ dout("lookup result=%p\n", dentry); @@ -789,6 +815,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ + req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); req->r_locked_dir = dir; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -887,6 +914,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_dentry = dget(new_dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); + req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); req->r_locked_dir = new_dir; req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -905,7 +933,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, */ /* d_move screws up d_subdirs order */ - ceph_i_clear(new_dir, CEPH_I_COMPLETE); + ceph_dir_clear_complete(new_dir); d_move(old_dentry, new_dentry); @@ -1002,36 +1030,38 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) */ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { + int valid = 0; struct inode *dir; if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; - dir = dentry->d_parent->d_inode; - dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode, ceph_dentry(dentry)->offset); + dir = ceph_get_dentry_parent_inode(dentry); + /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - goto out_touch; + valid = 1; + } else if (dentry->d_inode && + ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { + valid = 1; + } else if (dentry_lease_is_valid(dentry) || + dir_lease_is_valid(dir, dentry)) { + valid = 1; } - if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) - goto out_touch; - if (dentry_lease_is_valid(dentry) || - dir_lease_is_valid(dir, dentry)) - goto out_touch; - - dout("d_revalidate %p invalid\n", dentry); - d_drop(dentry); - return 0; -out_touch: - ceph_dentry_lru_touch(dentry); - return 1; + dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); + if (valid) + ceph_dentry_lru_touch(dentry); + else + d_drop(dentry); + iput(dir); + return valid; } /* @@ -1061,7 +1091,75 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, return 1; } +/* + * Set/clear/test dir complete flag on the dir's dentry. + */ +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + if (list_empty(&inode->i_dentry)) + return NULL; + alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias); + return alias; +} + +void ceph_dir_set_complete(struct inode *inode) +{ + struct dentry *dentry = __d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) complete\n", inode, dentry); + set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } +} + +void ceph_dir_clear_complete(struct inode *inode) +{ + struct dentry *dentry = __d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) NOT complete\n", inode, dentry); + clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } +} + +bool ceph_dir_test_complete(struct inode *inode) +{ + struct dentry *dentry = __d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) + return test_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + return false; +} + +/* + * When the VFS prunes a dentry from the cache, we need to clear the + * complete flag on the parent directory. + * + * Called under dentry->d_lock. + */ +static void ceph_d_prune(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + + dout("ceph_d_prune %p\n", dentry); + + /* do we have a valid parent? */ + if (!dentry->d_parent || IS_ROOT(dentry)) + return; + + /* if we are not hashed, we don't affect D_COMPLETE */ + if (d_unhashed(dentry)) + return; + + /* + * we hold d_lock, so d_parent is stable, and d_fsdata is never + * cleared until d_release + */ + di = ceph_dentry(dentry->d_parent); + clear_bit(CEPH_D_COMPLETE, &di->flags); +} /* * read() on a dir. This weird interface hack only works if mounted @@ -1228,9 +1326,8 @@ void ceph_dentry_lru_del(struct dentry *dn) * Return name hash for a given dentry. This is dependent on * the parent directory's hash function. */ -unsigned ceph_dentry_hash(struct dentry *dn) +unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) { - struct inode *dir = dn->d_parent->d_inode; struct ceph_inode_info *dci = ceph_inode(dir); switch (dci->i_dir_layout.dl_dir_hash) { @@ -1276,6 +1373,7 @@ const struct inode_operations ceph_dir_iops = { const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, .d_release = ceph_d_release, + .d_prune = ceph_d_prune, }; const struct dentry_operations ceph_snapdir_dentry_ops = { @@ -1285,4 +1383,5 @@ const struct dentry_operations ceph_snapdir_dentry_ops = { const struct dentry_operations ceph_snap_dentry_ops = { .d_release = ceph_d_release, + .d_prune = ceph_d_prune, }; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index f67b687550de..9fbcdecaaccd 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -46,7 +46,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, int type; struct ceph_nfs_fh *fh = (void *)rawfh; struct ceph_nfs_confh *cfh = (void *)rawfh; - struct dentry *parent = dentry->d_parent; + struct dentry *parent; struct inode *inode = dentry->d_inode; int connected_handle_length = sizeof(*cfh)/4; int handle_length = sizeof(*fh)/4; @@ -55,26 +55,33 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; + spin_lock(&dentry->d_lock); + parent = dget(dentry->d_parent); + spin_unlock(&dentry->d_lock); + if (*max_len >= connected_handle_length) { dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = ceph_dentry_hash(parent); + cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode, + dentry); *max_len = connected_handle_length; type = 2; } else if (*max_len >= handle_length) { if (connectable) { *max_len = connected_handle_length; - return 255; + type = 255; + } else { + dout("encode_fh %p\n", dentry); + fh->ino = ceph_ino(dentry->d_inode); + *max_len = handle_length; + type = 1; } - dout("encode_fh %p\n", dentry); - fh->ino = ceph_ino(dentry->d_inode); - *max_len = handle_length; - type = 1; } else { *max_len = handle_length; - return 255; + type = 255; } + dput(parent); return type; } @@ -123,7 +130,6 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, return dentry; } err = ceph_init_dentry(dentry); - if (err < 0) { iput(inode); return ERR_PTR(err); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 0d0eae05598f..ce549d31eeb7 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -122,7 +122,7 @@ int ceph_open(struct inode *inode, struct file *file) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *cf = file->private_data; - struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct inode *parent_inode = NULL; int err; int flags, fmode, wanted; @@ -194,7 +194,10 @@ int ceph_open(struct inode *inode, struct file *file) req->r_inode = inode; ihold(inode); req->r_num_caps = 1; + if (flags & (O_CREAT|O_TRUNC)) + parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); @@ -222,9 +225,9 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct file *file = nd->intent.open.file; - struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); + struct file *file; struct ceph_mds_request *req; + struct dentry *ret; int err; int flags = nd->intent.open.flags; @@ -242,16 +245,24 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, req->r_dentry_unless = CEPH_CAP_FILE_EXCL; } req->r_locked_dir = dir; /* caller holds dir->i_mutex */ - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - dentry = ceph_finish_lookup(req, dentry, err); - if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + err = ceph_mdsc_do_request(mdsc, + (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, + req); + err = ceph_handle_snapdir(req, dentry, err); + if (err) + goto out; + if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); - if (!err) - err = ceph_init_file(req->r_dentry->d_inode, file, - req->r_fmode); + if (err) + goto out; + file = lookup_instantiate_filp(nd, req->r_dentry, ceph_open); + if (IS_ERR(file)) + err = PTR_ERR(file); +out: + ret = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); - dout("ceph_lookup_open result=%p\n", dentry); - return dentry; + dout("ceph_lookup_open result=%p\n", ret); + return ret; } int ceph_release(struct inode *inode, struct file *file) @@ -643,7 +654,8 @@ again: if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) /* hmm, this isn't really async... */ ret = ceph_sync_read(filp, base, len, ppos, &checkeof); else @@ -712,7 +724,7 @@ retry_snap: want = CEPH_CAP_FILE_BUFFER; ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); if (ret < 0) - goto out; + goto out_put; dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, @@ -720,12 +732,23 @@ retry_snap: if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) { ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, &iocb->ki_pos); } else { - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + /* + * buffered write; drop Fw early to avoid slow + * revocation if we get stuck on balance_dirty_pages + */ + int dirty; + spin_lock(&inode->i_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&inode->i_lock); + ceph_put_cap_refs(ci, got); + + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); if ((ret >= 0 || ret == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { @@ -733,7 +756,12 @@ retry_snap: if (err < 0) ret = err; } + + if (dirty) + __mark_inode_dirty(inode, dirty); + goto out; } + if (ret >= 0) { int dirty; spin_lock(&inode->i_lock); @@ -743,12 +771,13 @@ retry_snap: __mark_inode_dirty(inode, dirty); } -out: +out_put: dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); +out: if (ret == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index dfb2831d8d85..116f36502f17 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -9,7 +9,6 @@ #include <linux/namei.h> #include <linux/writeback.h> #include <linux/vmalloc.h> -#include <linux/pagevec.h> #include "super.h" #include "mds_client.h" @@ -560,7 +559,8 @@ static int fill_inode(struct inode *inode, struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); int i; - int issued, implemented; + int issued = 0, implemented; + int updating_inode = 0; struct timespec mtime, atime, ctime; u32 nsplits; struct ceph_buffer *xattr_blob = NULL; @@ -599,7 +599,8 @@ static int fill_inode(struct inode *inode, if (le64_to_cpu(info->version) > 0 && (ci->i_version & ~1) >= le64_to_cpu(info->version)) goto no_change; - + + updating_inode = 1; issued = __ceph_caps_issued(ci, &implemented); issued |= implemented | __ceph_caps_dirty(ci); @@ -617,7 +618,7 @@ static int fill_inode(struct inode *inode, } if ((issued & CEPH_CAP_LINK_EXCL) == 0) - inode->i_nlink = le32_to_cpu(info->nlink); + set_nlink(inode, le32_to_cpu(info->nlink)); /* be careful with mtime, atime, size */ ceph_decode_timespec(&atime, &info->atime); @@ -707,17 +708,6 @@ static int fill_inode(struct inode *inode, ci->i_rfiles = le64_to_cpu(info->rfiles); ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); ceph_decode_timespec(&ci->i_rctime, &info->rctime); - - /* set dir completion flag? */ - if (ci->i_files == 0 && ci->i_subdirs == 0 && - ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && - (issued & CEPH_CAP_FILE_EXCL) == 0 && - (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { - dout(" marking %p complete (empty)\n", inode); - /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ - ci->i_max_offset = 2; - } break; default: pr_err("fill_inode %llx.%llx BAD mode 0%o\n", @@ -774,6 +764,19 @@ no_change: __ceph_get_fmode(ci, cap_fmode); } + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + updating_inode && /* didn't jump to no_change */ + ci->i_files == 0 && ci->i_subdirs == 0 && + ceph_snap(inode) == CEPH_NOSNAP && + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + !ceph_dir_test_complete(inode)) { + dout(" marking %p complete (empty)\n", inode); + ceph_dir_set_complete(inode); + ci->i_max_offset = 2; + } + /* update delegation info? */ if (dirinfo) ceph_fill_dirfrag(inode, dirinfo); @@ -805,14 +808,14 @@ static void update_dentry_lease(struct dentry *dentry, return; spin_lock(&dentry->d_lock); - dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n", - dentry, le16_to_cpu(lease->mask), duration, ttl); + dout("update_dentry_lease %p duration %lu ms ttl %lu\n", + dentry, duration, ttl); /* make lease_rdcache_gen match directory */ dir = dentry->d_parent->d_inode; di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; - if (lease->mask == 0) + if (duration == 0) goto out_unlock; if (di->lease_gen == session->s_cap_gen && @@ -839,11 +842,13 @@ out_unlock: /* * Set dentry's directory position based on the current dir's max, and * order it in d_subdirs, so that dcache_readdir behaves. + * + * Always called under directory's i_mutex. */ static void ceph_set_dentry_offset(struct dentry *dn) { struct dentry *dir = dn->d_parent; - struct inode *inode = dn->d_parent->d_inode; + struct inode *inode = dir->d_inode; struct ceph_dentry_info *di; BUG_ON(!inode); @@ -851,7 +856,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) di = ceph_dentry(dn); spin_lock(&inode->i_lock); - if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + if (!ceph_dir_test_complete(inode)) { spin_unlock(&inode->i_lock); return; } @@ -1022,9 +1027,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, /* do we have a dn lease? */ have_lease = have_dir_cap || - (le16_to_cpu(rinfo->dlease->mask) & - CEPH_LOCK_DN); - + le32_to_cpu(rinfo->dlease->duration_ms); if (!have_lease) dout("fill_trace no dentry lease or dir cap\n"); @@ -1053,7 +1056,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, * d_move() puts the renamed dentry at the end of * d_subdirs. We need to assign it an appropriate * directory offset so we can behave when holding - * I_COMPLETE. + * D_COMPLETE. */ ceph_set_dentry_offset(req->r_old_dentry); dout("dn %p gets new offset %lld\n", req->r_old_dentry, @@ -1325,12 +1328,13 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) */ void ceph_queue_writeback(struct inode *inode) { + ihold(inode); if (queue_work(ceph_inode_to_client(inode)->wb_wq, &ceph_inode(inode)->i_wb_work)) { dout("ceph_queue_writeback %p\n", inode); - ihold(inode); } else { dout("ceph_queue_writeback %p failed\n", inode); + iput(inode); } } @@ -1350,55 +1354,13 @@ static void ceph_writeback_work(struct work_struct *work) */ void ceph_queue_invalidate(struct inode *inode) { + ihold(inode); if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, &ceph_inode(inode)->i_pg_inv_work)) { dout("ceph_queue_invalidate %p\n", inode); - ihold(inode); } else { dout("ceph_queue_invalidate %p failed\n", inode); - } -} - -/* - * invalidate any pages that are not dirty or under writeback. this - * includes pages that are clean and mapped. - */ -static void ceph_invalidate_nondirty_pages(struct address_space *mapping) -{ - struct pagevec pvec; - pgoff_t next = 0; - int i; - - pagevec_init(&pvec, 0); - while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - pgoff_t index; - int skip_page = - (PageDirty(page) || PageWriteback(page)); - - if (!skip_page) - skip_page = !trylock_page(page); - - /* - * We really shouldn't be looking at the ->index of an - * unlocked page. But we're not allowed to lock these - * pages. So we rely upon nobody altering the ->index - * of this (pinned-by-us) page. - */ - index = page->index; - if (index > next) - next = index; - next++; - - if (skip_page) - continue; - - generic_error_remove_page(mapping, page); - unlock_page(page); - } - pagevec_release(&pvec); - cond_resched(); + iput(inode); } } @@ -1425,7 +1387,7 @@ static void ceph_invalidate_work(struct work_struct *work) orig_gen = ci->i_rdcache_gen; spin_unlock(&inode->i_lock); - ceph_invalidate_nondirty_pages(inode->i_mapping); + truncate_inode_pages(&inode->i_data, 0); spin_lock(&inode->i_lock); if (orig_gen == ci->i_rdcache_gen && @@ -1474,13 +1436,14 @@ void ceph_queue_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); + ihold(inode); if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); - ihold(inode); } else { dout("ceph_queue_vmtruncate %p failed, pending=%d\n", inode, ci->i_truncate_pending); + iput(inode); } } @@ -1560,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; @@ -1743,7 +1706,9 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index ef0b5f48e13a..5a14c29cbba6 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -38,21 +38,43 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file->f_dentry->d_inode; - struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; + struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); + struct ceph_ioctl_layout nl; int err, i; - /* copy and validate */ if (copy_from_user(&l, arg, sizeof(l))) return -EFAULT; - if ((l.object_size & ~PAGE_MASK) || - (l.stripe_unit & ~PAGE_MASK) || - !l.stripe_unit || - (l.object_size && - (unsigned)l.object_size % (unsigned)l.stripe_unit)) + /* validate changed params against current layout */ + err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); + if (!err) { + nl.stripe_unit = ceph_file_layout_su(ci->i_layout); + nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + nl.object_size = ceph_file_layout_object_size(ci->i_layout); + nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); + nl.preferred_osd = + (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); + } else + return err; + + if (l.stripe_count) + nl.stripe_count = l.stripe_count; + if (l.stripe_unit) + nl.stripe_unit = l.stripe_unit; + if (l.object_size) + nl.object_size = l.object_size; + if (l.data_pool) + nl.data_pool = l.data_pool; + if (l.preferred_osd) + nl.preferred_osd = l.preferred_osd; + + if ((nl.object_size & ~PAGE_MASK) || + (nl.stripe_unit & ~PAGE_MASK) || + ((unsigned)nl.object_size % (unsigned)nl.stripe_unit)) return -EINVAL; /* make sure it's a valid data pool */ @@ -87,7 +109,9 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) req->r_args.setlayout.layout.fl_pg_preferred = cpu_to_le32(l.preferred_osd); + parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); return err; } @@ -231,6 +255,14 @@ static long ceph_ioctl_lazyio(struct file *file) return 0; } +static long ceph_ioctl_syncio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + + fi->flags |= CEPH_F_SYNC; + return 0; +} + long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); @@ -249,6 +281,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_LAZYIO: return ceph_ioctl_lazyio(file); + + case CEPH_IOC_SYNCIO: + return ceph_ioctl_syncio(file); } return -ENOTTY; diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 52e8fd74d450..be4a60487333 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -6,7 +6,31 @@ #define CEPH_IOCTL_MAGIC 0x97 -/* just use u64 to align sanely on all archs */ +/* + * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy + * CEPH_IOC_SET_LAYOUT - set file layout + * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy + * + * The file layout specifies how file data is striped over objects in + * the distributed object store, which object pool they belong to (if + * it differs from the default), and an optional 'preferred osd' to + * store them on. + * + * Files get a new layout based on the policy set on the containing + * directory or one of its ancestors. The GET_LAYOUT ioctl will let + * you examine the layout for a file or the policy on a directory. + * + * SET_LAYOUT will let you set a layout on a newly created file. This + * only works immediately after the file is created and before any + * data is written to it. + * + * SET_LAYOUT_POLICY will let you set a layout policy (default layout) + * on a directory that will apply to any new files created in that + * directory (or any child directory that doesn't specify a layout of + * its own). + */ + +/* use u64 to align sanely on all archs */ struct ceph_ioctl_layout { __u64 stripe_unit, stripe_count, object_size; __u64 data_pool; @@ -21,6 +45,8 @@ struct ceph_ioctl_layout { struct ceph_ioctl_layout) /* + * CEPH_IOC_GET_DATALOC - get location of file data in the cluster + * * Extract identity, address of the OSD and object storing a given * file offset. */ @@ -39,6 +65,34 @@ struct ceph_ioctl_dataloc { #define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ struct ceph_ioctl_dataloc) +/* + * CEPH_IOC_LAZYIO - relax consistency + * + * Normally Ceph switches to synchronous IO when multiple clients have + * the file open (and or more for write). Reads and writes bypass the + * page cache and go directly to the OSD. Setting this flag on a file + * descriptor will allow buffered IO for this file in cases where the + * application knows it won't interfere with other nodes (or doesn't + * care). + */ #define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) +/* + * CEPH_IOC_SYNCIO - force synchronous IO + * + * This ioctl sets a file flag that forces the synchronous IO that + * bypasses the page cache, even if it is not necessary. This is + * essentially the opposite behavior of IOC_LAZYIO. This forces the + * same read/write path as a file opened by multiple clients when one + * or more of those clients is opened for write. + * + * Note that this type of sync IO takes a different path than a file + * opened with O_SYNC/D_SYNC (writes hit the page cache and are + * immediately flushed on page boundaries). It is very similar to + * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes + * are not copied (user page must remain stable) and O_DIRECT writes + * have alignment restrictions (on the buffer and file offset). + */ +#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) + #endif diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0c1d91756528..264ab701154f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -483,22 +483,26 @@ void ceph_mdsc_release_request(struct kref *kref) destroy_reply_info(&req->r_reply_info); } if (req->r_inode) { - ceph_put_cap_refs(ceph_inode(req->r_inode), - CEPH_CAP_PIN); + ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); } if (req->r_locked_dir) - ceph_put_cap_refs(ceph_inode(req->r_locked_dir), - CEPH_CAP_PIN); + ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); if (req->r_target_inode) iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) { - ceph_put_cap_refs( - ceph_inode(req->r_old_dentry->d_parent->d_inode), - CEPH_CAP_PIN); + /* + * track (and drop pins for) r_old_dentry_dir + * separately, since r_old_dentry's d_parent may have + * changed between the dir mutex being dropped and + * this request being freed. + */ + ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); dput(req->r_old_dentry); + iput(req->r_old_dentry_dir); } kfree(req->r_path1); kfree(req->r_path2); @@ -615,8 +619,14 @@ static void __unregister_request(struct ceph_mds_client *mdsc, * * Called under mdsc->mutex. */ -struct dentry *get_nonsnap_parent(struct dentry *dentry) +static struct dentry *get_nonsnap_parent(struct dentry *dentry) { + /* + * we don't need to worry about protecting the d_parent access + * here because we never renaming inside the snapped namespace + * except to resplice to another snapdir, and either the old or new + * result is a valid result. + */ while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) dentry = dentry->d_parent; return dentry; @@ -652,7 +662,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (req->r_inode) { inode = req->r_inode; } else if (req->r_dentry) { - struct inode *dir = req->r_dentry->d_parent->d_inode; + /* ignore race with rename; old or new d_parent is okay */ + struct dentry *parent = req->r_dentry->d_parent; + struct inode *dir = parent->d_inode; if (dir->i_sb != mdsc->fsc->sb) { /* not this fs! */ @@ -660,8 +672,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else if (ceph_snap(dir) != CEPH_NOSNAP) { /* direct snapped/virtual snapdir requests * based on parent dir inode */ - struct dentry *dn = - get_nonsnap_parent(req->r_dentry->d_parent); + struct dentry *dn = get_nonsnap_parent(parent); inode = dn->d_inode; dout("__choose_mds using nonsnap parent %p\n", inode); } else if (req->r_dentry->d_inode) { @@ -670,7 +681,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else { /* dir + name */ inode = dir; - hash = ceph_dentry_hash(req->r_dentry); + hash = ceph_dentry_hash(dir, req->r_dentry); is_hash = true; } } @@ -753,7 +764,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) struct ceph_msg *msg; struct ceph_mds_session_head *h; - msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, + false); if (!msg) { pr_err("create_session_msg ENOMEM creating msg\n"); return NULL; @@ -1229,7 +1241,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, while (session->s_num_cap_releases < session->s_nr_caps + extra) { spin_unlock(&session->s_cap_lock); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - GFP_NOFS); + GFP_NOFS, false); if (!msg) goto out_unlocked; dout("add_cap_releases %p msg %p now %d\n", session, msg, @@ -1584,7 +1596,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); - } else if (rpath) { + } else if (rpath || rino) { *ino = rino; *ppath = rpath; *pathlen = strlen(rpath); @@ -1641,7 +1653,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_old_dentry_drop) len += req->r_old_dentry->d_name.len; - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); goto out_free2; @@ -1931,9 +1943,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, if (req->r_locked_dir) ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); if (req->r_old_dentry) - ceph_get_cap_refs( - ceph_inode(req->r_old_dentry->d_parent->d_inode), - CEPH_CAP_PIN); + ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); /* issue */ mutex_lock(&mdsc->mutex); @@ -1991,7 +2002,7 @@ out: } /* - * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS + * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS * namespace request. */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) @@ -1999,9 +2010,9 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req) struct inode *inode = req->r_locked_dir; struct ceph_inode_info *ci = ceph_inode(inode); - dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); + dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); spin_lock(&inode->i_lock); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ceph_dir_clear_complete(inode); ci->i_release_count++; spin_unlock(&inode->i_lock); @@ -2508,7 +2519,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail_nopagelist; ceph_pagelist_init(pagelist); - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); if (!reply) goto fail_nomsg; @@ -2714,7 +2725,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_mds_lease *h = msg->front.iov_base; u32 seq; struct ceph_vino vino; - int mask; struct qstr dname; int release = 0; @@ -2725,7 +2735,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, goto bad; vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; - mask = le16_to_cpu(h->mask); seq = le32_to_cpu(h->seq); dname.name = (void *)h + sizeof(*h) + sizeof(u32); dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); @@ -2737,8 +2746,8 @@ static void handle_lease(struct ceph_mds_client *mdsc, /* lookup inode */ inode = ceph_find_inode(sb, vino); - dout("handle_lease %s, mask %d, ino %llx %p %.*s\n", - ceph_lease_op_name(h->action), mask, vino.ino, inode, + dout("handle_lease %s, ino %llx %p %.*s\n", + ceph_lease_op_name(h->action), vino.ino, inode, dname.len, dname.name); if (inode == NULL) { dout("handle_lease no inode %llx\n", vino.ino); @@ -2823,12 +2832,11 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, dnamelen = dentry->d_name.len; len += dnamelen; - msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); if (!msg) return; lease = msg->front.iov_base; lease->action = action; - lease->mask = cpu_to_le16(1); lease->ino = cpu_to_le64(ceph_vino(inode).ino); lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); lease->seq = cpu_to_le32(seq); @@ -2850,7 +2858,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, * Pass @inode always, @dentry is optional. */ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, - struct dentry *dentry, int mask) + struct dentry *dentry) { struct ceph_dentry_info *di; struct ceph_mds_session *session; @@ -2858,7 +2866,6 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, BUG_ON(inode == NULL); BUG_ON(dentry == NULL); - BUG_ON(mask == 0); /* is dentry lease valid? */ spin_lock(&dentry->d_lock); @@ -2868,8 +2875,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, di->lease_gen != di->lease_session->s_cap_gen || !time_before(jiffies, dentry->d_time)) { dout("lease_release inode %p dentry %p -- " - "no lease on %d\n", - inode, dentry, mask); + "no lease\n", + inode, dentry); spin_unlock(&dentry->d_lock); return; } @@ -2880,8 +2887,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, __ceph_mdsc_drop_dentry_lease(dentry); spin_unlock(&dentry->d_lock); - dout("lease_release inode %p dentry %p mask %d to mds%d\n", - inode, dentry, mask, session->s_mds); + dout("lease_release inode %p dentry %p to mds%d\n", + inode, dentry, session->s_mds); ceph_mdsc_lease_send_msg(session, inode, dentry, CEPH_MDS_LEASE_RELEASE, seq); ceph_put_mds_session(session); @@ -3147,7 +3154,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) /* * true if all sessions are closed, or we force unmount */ -bool done_closing_sessions(struct ceph_mds_client *mdsc) +static bool done_closing_sessions(struct ceph_mds_client *mdsc) { int i, n = 0; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 7d8a0d662d56..4bb239921dbd 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -171,6 +171,7 @@ struct ceph_mds_request { struct inode *r_inode; /* arg1 */ struct dentry *r_dentry; /* arg1 */ struct dentry *r_old_dentry; /* arg2: rename from or link from */ + struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */ char *r_path1, *r_path2; struct ceph_vino r_ino1, r_ino2; @@ -333,7 +334,7 @@ extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, - struct dentry *dn, int mask); + struct dentry *dn); extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 54b14de2e729..e26437191333 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -449,6 +449,15 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) spin_lock(&inode->i_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); + + /* + * If there is a write in progress, treat that as a dirty Fw, + * even though it hasn't completed yet; by the time we finish + * up this capsnap it will be. + */ + if (used & CEPH_CAP_FILE_WR) + dirty |= CEPH_CAP_FILE_WR; + if (__ceph_have_pending_cap_snap(ci)) { /* there is no point in queuing multiple "pending" cap_snaps, as no new writes are allowed to start when pending, so any @@ -456,13 +465,19 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); - } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) || - (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| - CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) { + } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| + CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { struct ceph_snap_context *snapc = ci->i_head_snapc; - dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, - capsnap, snapc); + /* + * if we are a sync write, we may need to go to the snaprealm + * to get the current snapc. + */ + if (!snapc) + snapc = ci->i_snap_realm->cached_context; + + dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", + inode, capsnap, snapc, ceph_cap_string(dirty)); ihold(inode); atomic_set(&capsnap->nref, 1); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f2f77fd3c14c..8dc73a594a90 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -73,8 +73,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) */ buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); - buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >> - (CEPH_BLOCK_SHIFT-10); + buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_files = le64_to_cpu(st.num_objects); @@ -115,6 +114,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait) enum { Opt_wsize, Opt_rsize, + Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, Opt_cap_release_safety, @@ -137,6 +137,7 @@ enum { static match_table_t fsopt_tokens = { {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, + {Opt_rasize, "rasize=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, {Opt_cap_release_safety, "cap_release_safety=%d"}, @@ -197,6 +198,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_rsize: fsopt->rsize = intval; break; + case Opt_rasize: + fsopt->rasize = intval; + break; case Opt_caps_wanted_delay_min: fsopt->caps_wanted_delay_min = intval; break; @@ -290,28 +294,29 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); - fsopt->sb_flags = flags; - fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + fsopt->sb_flags = flags; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - fsopt->rsize = CEPH_RSIZE_DEFAULT; - fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->rasize = CEPH_RASIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; - fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; - fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; - fsopt->congestion_kb = default_congestion_kb(); - - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ - err = -EINVAL; - if (!dev_name) - goto out; - *path = strstr(dev_name, ":/"); - if (*path == NULL) { - pr_err("device name is missing path (no :/ in %s)\n", - dev_name); - goto out; - } + fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + err = -EINVAL; + if (!dev_name) + goto out; + *path = strstr(dev_name, ":/"); + if (*path == NULL) { + pr_err("device name is missing path (no :/ in %s)\n", + dev_name); + goto out; + } dev_name_end = *path; dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); @@ -377,6 +382,8 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) seq_printf(m, ",rsize=%d", fsopt->rsize); + if (fsopt->rasize != CEPH_RASIZE_DEFAULT) + seq_printf(m, ",rasize=%d", fsopt->rsize); if (fsopt->congestion_kb != default_congestion_kb()) seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) @@ -419,24 +426,27 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) /* * create a new fs client */ -struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, +static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; + const unsigned supported_features = + CEPH_FEATURE_FLOCK | + CEPH_FEATURE_DIRLAYOUTHASH; + const unsigned required_features = 0; int err = -ENOMEM; fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); if (!fsc) return ERR_PTR(-ENOMEM); - fsc->client = ceph_create_client(opt, fsc); + fsc->client = ceph_create_client(opt, fsc, supported_features, + required_features); if (IS_ERR(fsc->client)) { err = PTR_ERR(fsc->client); goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->supported_features |= CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH; fsc->client->monc.want_mdsmap = 1; fsc->mount_options = fsopt; @@ -492,7 +502,7 @@ fail: return ERR_PTR(err); } -void destroy_fs_client(struct ceph_fs_client *fsc) +static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); @@ -628,10 +638,12 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, if (err == 0) { dout("open_root_inode success\n"); if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && - fsc->sb->s_root == NULL) + fsc->sb->s_root == NULL) { root = d_alloc_root(req->r_target_inode); - else + ceph_init_dentry(root); + } else { root = d_obtain_alias(req->r_target_inode); + } req->r_target_inode = NULL; dout("open_root_inode success, root dentry is %p\n", root); } else { @@ -775,11 +787,15 @@ static int ceph_register_bdi(struct super_block *sb, { int err; - /* set ra_pages based on rsize mount option? */ - if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + /* set ra_pages based on rasize mount option? */ + if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE) fsc->backing_dev_info.ra_pages = - (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; + else + fsc->backing_dev_info.ra_pages = + default_backing_dev_info.ra_pages; + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", atomic_long_inc_return(&bdi_seq)); if (!err) @@ -810,8 +826,8 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, fsc = create_fs_client(fsopt, opt); if (IS_ERR(fsc)) { res = ERR_CAST(fsc); - kfree(fsopt); - kfree(opt); + destroy_mount_options(fsopt); + ceph_destroy_options(opt); goto out_final; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 30446b144e3d..01bf189e08a9 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -36,7 +36,8 @@ #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */ +#define CEPH_RSIZE_DEFAULT 0 /* max read size */ +#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" @@ -45,8 +46,9 @@ struct ceph_mount_options { int flags; int sb_flags; - int wsize; - int rsize; /* max readahead */ + int wsize; /* max write size */ + int rsize; /* max read size */ + int rasize; /* max readahead */ int congestion_kb; /* max writeback in flight */ int caps_wanted_delay_min, caps_wanted_delay_max; int cap_release_safety; @@ -201,6 +203,7 @@ struct ceph_inode_xattr { * Ceph dentry state */ struct ceph_dentry_info { + unsigned long flags; struct ceph_mds_session *lease_session; u32 lease_gen, lease_shared_gen; u32 lease_seq; @@ -211,6 +214,18 @@ struct ceph_dentry_info { u64 offset; }; +/* + * dentry flags + * + * The locking for D_COMPLETE is a bit odd: + * - we can clear it at almost any time (see ceph_d_prune) + * - it is only meaningful if: + * - we hold dir inode i_lock + * - we hold dir FILE_SHARED caps + * - the dentry D_COMPLETE is set + */ +#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */ + struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -249,7 +264,7 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */ + u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; @@ -344,9 +359,10 @@ static inline struct ceph_vino ceph_vino(struct inode *inode) * x86_64+ino32 64 32 * x86_64 64 64 */ -static inline u32 ceph_ino_to_ino32(ino_t ino) +static inline u32 ceph_ino_to_ino32(__u64 vino) { - ino ^= ino >> (sizeof(ino) * 8 - 32); + u32 ino = vino & 0xffffffff; + ino ^= vino >> 32; if (!ino) ino = 1; return ino; @@ -357,11 +373,11 @@ static inline u32 ceph_ino_to_ino32(ino_t ino) */ static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) { - ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ #if BITS_PER_LONG == 32 - ino = ceph_ino_to_ino32(ino); + return ceph_ino_to_ino32(vino.ino); +#else + return (ino_t)vino.ino; #endif - return ino; } /* @@ -413,7 +429,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* * Ceph inode. */ -#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ #define CEPH_I_NODELAY 4 /* do not delay cap release */ #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ @@ -471,6 +486,13 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) } /* + * set/clear directory D_COMPLETE flag + */ +void ceph_dir_set_complete(struct inode *inode); +void ceph_dir_clear_complete(struct inode *inode); +bool ceph_dir_test_complete(struct inode *inode); + +/* * caps helpers */ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) @@ -543,13 +565,16 @@ extern void ceph_reservation_status(struct ceph_fs_client *client, /* * we keep buffered readdir results attached to file->private_data */ +#define CEPH_F_SYNC 1 +#define CEPH_F_ATEND 2 + struct ceph_file_info { - int fmode; /* initialized on open */ + short fmode; /* initialized on open */ + short flags; /* CEPH_F_* */ /* readdir: position within the dir */ u32 frag; struct ceph_mds_request *last_readdir; - int at_end; /* readdir: position within a frag */ unsigned offset; /* offset of last chunk, adjusted for . and .. */ @@ -789,6 +814,8 @@ extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); +extern int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); @@ -796,7 +823,8 @@ extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); -extern unsigned ceph_dentry_hash(struct dentry *dn); +extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); +extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); /* * our d_ops vary depending on whether the inode is live, @@ -819,14 +847,6 @@ extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, int p_locks, int f_locks); extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); -static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) -{ - if (dentry && dentry->d_parent) - return dentry->d_parent->d_inode; - - return NULL; -} - /* debugfs.c */ extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f42d730f1b66..96c6739a0280 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -629,7 +629,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; int err; @@ -677,7 +677,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, req->r_data_len = size; dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); @@ -788,7 +790,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_request *req; int err; @@ -802,7 +804,9 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); return err; } diff --git a/fs/cifs/README b/fs/cifs/README index c5c2c5e5f0f2..895da1dc1550 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -745,4 +745,18 @@ installed and something like the following lines should be added to the create cifs.spnego * * /usr/local/sbin/cifs.upcall %k create dns_resolver * * /usr/local/sbin/cifs.upcall %k +CIFS kernel module parameters +============================= +These module parameters can be specified or modified either during the time of +module loading or during the runtime by using the interface + /proc/module/cifs/parameters/<param> + +i.e. echo "value" > /sys/module/cifs/parameters/<param> + +1. echo_retries - The number of echo attempts before giving up and + reconnecting to the server. The default is 5. The value 0 + means never reconnect. + +2. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default. + [Y/y/1]. To disable use any of [N/n/0]. diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 2fe3cf13b2e9..84e8c0724704 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -176,7 +176,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_CIFS_STATS2 seq_printf(m, " In Send: %d In MaxReq Wait: %d", - atomic_read(&server->inSend), + atomic_read(&server->in_send), atomic_read(&server->num_waiters)); #endif @@ -511,7 +511,7 @@ static const struct file_operations cifsFYI_proc_fops = { static int cifs_oplock_proc_show(struct seq_file *m, void *v) { - seq_printf(m, "%d\n", oplockEnabled); + seq_printf(m, "%d\n", enable_oplocks); return 0; } @@ -526,13 +526,16 @@ static ssize_t cifs_oplock_proc_write(struct file *file, char c; int rc; + printk(KERN_WARNING "CIFS: The /proc/fs/cifs/OplockEnabled interface " + "will be removed in kernel version 3.4. Please migrate to " + "using the 'enable_oplocks' module parameter in cifs.ko.\n"); rc = get_user(c, buffer); if (rc) return rc; if (c == '0' || c == 'n' || c == 'N') - oplockEnabled = 0; + enable_oplocks = false; else if (c == '1' || c == 'y' || c == 'Y') - oplockEnabled = 1; + enable_oplocks = true; return count; } diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 8d8f28c94c0f..6873bb634a97 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -141,10 +141,11 @@ char *cifs_compose_mount_options(const char *sb_mountdata, rc = dns_resolve_server_name_to_ip(*devname, &srvIP); if (rc < 0) { - cERROR(1, "%s: Failed to resolve server part of %s to IP: %d", - __func__, *devname, rc); + cFYI(1, "%s: Failed to resolve server part of %s to IP: %d", + __func__, *devname, rc); goto compose_mount_options_err; } + /* md_len = strlen(...) + 12 for 'sep+prefixpath=' * assuming that we have 'unc=' and 'ip=' in * the original sb_mountdata diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 7260e11e21f8..500d65859279 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -43,6 +43,8 @@ #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ #define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ +#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ +#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ struct cifs_sb_info { struct rb_root tlink_tree; @@ -55,6 +57,8 @@ struct cifs_sb_info { atomic_t active; uid_t mnt_uid; gid_t mnt_gid; + uid_t mnt_backupuid; + gid_t mnt_backupgid; mode_t mnt_file_mode; mode_t mnt_dir_mode; unsigned int mnt_cifs_flags; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 21de1d6d5849..72ddf23ef6f7 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -91,9 +91,76 @@ cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc) shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); spin_unlock(&sidgidlock); + root = &siduidtree; + spin_lock(&uidsidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&uidsidlock); + + root = &sidgidtree; + spin_lock(&gidsidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&gidsidlock); + return nr_rem; } +static void +sid_rb_insert(struct rb_root *root, unsigned long cid, + struct cifs_sid_id **psidid, char *typestr) +{ + char *strptr; + struct rb_node *node = root->rb_node; + struct rb_node *parent = NULL; + struct rb_node **linkto = &(root->rb_node); + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + parent = node; + if (cid > lsidid->id) { + linkto = &(node->rb_left); + node = node->rb_left; + } + if (cid < lsidid->id) { + linkto = &(node->rb_right); + node = node->rb_right; + } + } + + (*psidid)->id = cid; + (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); + (*psidid)->refcount = 0; + + sprintf((*psidid)->sidstr, "%s", typestr); + strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); + sprintf(strptr, "%ld", cid); + + clear_bit(SID_ID_PENDING, &(*psidid)->state); + clear_bit(SID_ID_MAPPED, &(*psidid)->state); + + rb_link_node(&(*psidid)->rbnode, parent, linkto); + rb_insert_color(&(*psidid)->rbnode, root); +} + +static struct cifs_sid_id * +sid_rb_search(struct rb_root *root, unsigned long cid) +{ + struct rb_node *node = root->rb_node; + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + if (cid > lsidid->id) + node = node->rb_left; + else if (cid < lsidid->id) + node = node->rb_right; + else /* node found */ + return lsidid; + } + + return NULL; +} + static struct shrinker cifs_shrinker = { .shrink = cifs_idmap_shrinker, .seeks = DEFAULT_SEEKS, @@ -110,6 +177,7 @@ cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen) memcpy(payload, data, datalen); key->payload.data = payload; + key->datalen = datalen; return 0; } @@ -224,6 +292,120 @@ sidid_pending_wait(void *unused) } static int +id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) +{ + int rc = 0; + struct key *sidkey; + const struct cred *saved_cred; + struct cifs_sid *lsid; + struct cifs_sid_id *psidid, *npsidid; + struct rb_root *cidtree; + spinlock_t *cidlock; + + if (sidtype == SIDOWNER) { + cidlock = &siduidlock; + cidtree = &uidtree; + } else if (sidtype == SIDGROUP) { + cidlock = &sidgidlock; + cidtree = &gidtree; + } else + return -EINVAL; + + spin_lock(cidlock); + psidid = sid_rb_search(cidtree, cid); + + if (!psidid) { /* node does not exist, allocate one & attempt adding */ + spin_unlock(cidlock); + npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); + if (!npsidid) + return -ENOMEM; + + npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); + if (!npsidid->sidstr) { + kfree(npsidid); + return -ENOMEM; + } + + spin_lock(cidlock); + psidid = sid_rb_search(cidtree, cid); + if (psidid) { /* node happened to get inserted meanwhile */ + ++psidid->refcount; + spin_unlock(cidlock); + kfree(npsidid->sidstr); + kfree(npsidid); + } else { + psidid = npsidid; + sid_rb_insert(cidtree, cid, &psidid, + sidtype == SIDOWNER ? "oi:" : "gi:"); + ++psidid->refcount; + spin_unlock(cidlock); + } + } else { + ++psidid->refcount; + spin_unlock(cidlock); + } + + /* + * If we are here, it is safe to access psidid and its fields + * since a reference was taken earlier while holding the spinlock. + * A reference on the node is put without holding the spinlock + * and it is OK to do so in this case, shrinker will not erase + * this node until all references are put and we do not access + * any fields of the node after a reference is put . + */ + if (test_bit(SID_ID_MAPPED, &psidid->state)) { + memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + psidid->time = jiffies; /* update ts for accessing */ + goto id_sid_out; + } + + if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) { + rc = -EINVAL; + goto id_sid_out; + } + + if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { + saved_cred = override_creds(root_cred); + sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); + if (IS_ERR(sidkey)) { + rc = -EINVAL; + cFYI(1, "%s: Can't map and id to a SID", __func__); + } else { + lsid = (struct cifs_sid *)sidkey->payload.data; + memcpy(&psidid->sid, lsid, + sidkey->datalen < sizeof(struct cifs_sid) ? + sidkey->datalen : sizeof(struct cifs_sid)); + memcpy(ssid, &psidid->sid, + sidkey->datalen < sizeof(struct cifs_sid) ? + sidkey->datalen : sizeof(struct cifs_sid)); + set_bit(SID_ID_MAPPED, &psidid->state); + key_put(sidkey); + kfree(psidid->sidstr); + } + psidid->time = jiffies; /* update ts for accessing */ + revert_creds(saved_cred); + clear_bit(SID_ID_PENDING, &psidid->state); + wake_up_bit(&psidid->state, SID_ID_PENDING); + } else { + rc = wait_on_bit(&psidid->state, SID_ID_PENDING, + sidid_pending_wait, TASK_INTERRUPTIBLE); + if (rc) { + cFYI(1, "%s: sidid_pending_wait interrupted %d", + __func__, rc); + --psidid->refcount; + return rc; + } + if (test_bit(SID_ID_MAPPED, &psidid->state)) + memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + else + rc = -EINVAL; + } +id_sid_out: + --psidid->refcount; + return rc; +} + +static int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct cifs_fattr *fattr, uint sidtype) { @@ -383,6 +565,10 @@ init_cifs_idmap(void) spin_lock_init(&sidgidlock); gidtree = RB_ROOT; + spin_lock_init(&uidsidlock); + siduidtree = RB_ROOT; + spin_lock_init(&gidsidlock); + sidgidtree = RB_ROOT; register_shrinker(&cifs_shrinker); cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring)); @@ -422,6 +608,18 @@ cifs_destroy_idmaptrees(void) while ((node = rb_first(root))) rb_erase(node, root); spin_unlock(&sidgidlock); + + root = &siduidtree; + spin_lock(&uidsidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&uidsidlock); + + root = &sidgidtree; + spin_lock(&gidsidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&gidsidlock); } /* if the two SIDs (roughly equivalent to a UUID for a user or group) are @@ -706,7 +904,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, acl_size = sizeof(struct cifs_acl); num_aces = le32_to_cpu(pdacl->num_aces); - if (num_aces > 0) { + if (num_aces > 0) { umode_t user_mask = S_IRWXU; umode_t group_mask = S_IRWXG; umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; @@ -868,52 +1066,82 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, else cFYI(1, "no ACL"); /* BB grant all or default perms? */ -/* cifscred->uid = owner_sid_ptr->rid; - cifscred->gid = group_sid_ptr->rid; - memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr, - sizeof(struct cifs_sid)); - memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr, - sizeof(struct cifs_sid)); */ - return rc; } - /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, - struct inode *inode, __u64 nmode) + __u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag) { int rc = 0; __u32 dacloffset; __u32 ndacloffset; __u32 sidsoffset; struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ - if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL)) - return -EIO; - - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + if (nmode != NO_CHANGE_64) { /* chmod */ + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); - - dacloffset = le32_to_cpu(pntsd->dacloffset); - dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); - - ndacloffset = sizeof(struct cifs_ntsd); - ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); - ndacl_ptr->revision = dacl_ptr->revision; - ndacl_ptr->size = 0; - ndacl_ptr->num_aces = 0; - - rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode); - - sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); - - /* copy security descriptor control portion and owner and group sid */ - copy_sec_desc(pntsd, pnntsd, sidsoffset); + dacloffset = le32_to_cpu(pntsd->dacloffset); + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + ndacloffset = sizeof(struct cifs_ntsd); + ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); + ndacl_ptr->revision = dacl_ptr->revision; + ndacl_ptr->size = 0; + ndacl_ptr->num_aces = 0; + + rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, + nmode); + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); + /* copy sec desc control portion & owner and group sids */ + copy_sec_desc(pntsd, pnntsd, sidsoffset); + *aclflag = CIFS_ACL_DACL; + } else { + memcpy(pnntsd, pntsd, secdesclen); + if (uid != NO_CHANGE_32) { /* chown */ + owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + + le32_to_cpu(pnntsd->osidoffset)); + nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), + GFP_KERNEL); + if (!nowner_sid_ptr) + return -ENOMEM; + rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr); + if (rc) { + cFYI(1, "%s: Mapping error %d for owner id %d", + __func__, rc, uid); + kfree(nowner_sid_ptr); + return rc; + } + memcpy(owner_sid_ptr, nowner_sid_ptr, + sizeof(struct cifs_sid)); + kfree(nowner_sid_ptr); + *aclflag = CIFS_ACL_OWNER; + } + if (gid != NO_CHANGE_32) { /* chgrp */ + group_sid_ptr = (struct cifs_sid *)((char *)pnntsd + + le32_to_cpu(pnntsd->gsidoffset)); + ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), + GFP_KERNEL); + if (!ngroup_sid_ptr) + return -ENOMEM; + rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr); + if (rc) { + cFYI(1, "%s: Mapping error %d for group id %d", + __func__, rc, gid); + kfree(ngroup_sid_ptr); + return rc; + } + memcpy(group_sid_ptr, ngroup_sid_ptr, + sizeof(struct cifs_sid)); + kfree(ngroup_sid_ptr); + *aclflag = CIFS_ACL_GROUP; + } + } return rc; } @@ -945,7 +1173,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, { struct cifs_ntsd *pntsd = NULL; int oplock = 0; - int xid, rc; + int xid, rc, create_options = 0; __u16 fid; struct cifs_tcon *tcon; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); @@ -956,9 +1184,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, tcon = tlink_tcon(tlink); xid = GetXid(); - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, - &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, + create_options, &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (!rc) { rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); CIFSSMBClose(xid, tcon, fid); @@ -991,31 +1222,15 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, return pntsd; } -static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid, - struct cifs_ntsd *pnntsd, u32 acllen) -{ - int xid, rc; - struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); - - if (IS_ERR(tlink)) - return PTR_ERR(tlink); - - xid = GetXid(); - rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen); - FreeXid(xid); - cifs_put_tlink(tlink); - - cFYI(DBG2, "SetCIFSACL rc = %d", rc); - return rc; -} - -static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, - struct cifs_ntsd *pnntsd, u32 acllen) + /* Set an ACL on the server */ +int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, + struct inode *inode, const char *path, int aclflag) { int oplock = 0; - int xid, rc; + int xid, rc, access_flags, create_options = 0; __u16 fid; struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1024,15 +1239,23 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, tcon = tlink_tcon(tlink); xid = GetXid(); - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0, - &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP) + access_flags = WRITE_OWNER; + else + access_flags = WRITE_DAC; + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags, + create_options, &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { cERROR(1, "Unable to open file to set ACL"); goto out; } - rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen); + rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag); cFYI(DBG2, "SetCIFSACL rc = %d", rc); CIFSSMBClose(xid, tcon, fid); @@ -1042,25 +1265,6 @@ out: return rc; } -/* Set an ACL on the server */ -int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, - struct inode *inode, const char *path) -{ - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct cifsFileInfo *open_file; - int rc; - - cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); - - open_file = find_readable_file(CIFS_I(inode), true); - if (!open_file) - return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); - - rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); - cifsFileInfo_put(open_file); - return rc; -} - /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, @@ -1092,9 +1296,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, } /* Convert mode bits to an ACL so we can update the ACL on the server */ -int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) +int +id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, + uid_t uid, gid_t gid) { int rc = 0; + int aclflag = CIFS_ACL_DACL; /* default flag to set */ __u32 secdesclen = 0; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ @@ -1124,13 +1331,15 @@ int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) return -ENOMEM; } - rc = build_sec_desc(pntsd, pnntsd, inode, nmode); + rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, + &aclflag); cFYI(DBG2, "build_sec_desc rc: %d", rc); if (!rc) { /* Set the security descriptor */ - rc = set_cifs_acl(pnntsd, secdesclen, inode, path); + rc = set_cifs_acl(pnntsd, secdesclen, inode, + path, aclflag); cFYI(DBG2, "set_cifs_acl rc: %d", rc); } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 5a0ee7f2af06..5d9b9acc5fce 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -37,12 +37,13 @@ * the sequence number before this function is called. Also, this function * should be called with the server->srv_mutex held. */ -static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, - struct TCP_Server_Info *server, char *signature) +static int cifs_calc_signature(const struct kvec *iov, int n_vec, + struct TCP_Server_Info *server, char *signature) { + int i; int rc; - if (cifs_pdu == NULL || signature == NULL || server == NULL) + if (iov == NULL || signature == NULL || server == NULL) return -EINVAL; if (!server->secmech.sdescmd5) { @@ -52,73 +53,17 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, rc = crypto_shash_init(&server->secmech.sdescmd5->shash); if (rc) { - cERROR(1, "%s: Oould not init md5\n", __func__); + cERROR(1, "%s: Could not init md5\n", __func__); return rc; } - crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, server->session_key.response, server->session_key.len); - - crypto_shash_update(&server->secmech.sdescmd5->shash, - cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); - - rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); - - return 0; -} - -/* must be called with server->srv_mutex held */ -int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, - __u32 *pexpected_response_sequence_number) -{ - int rc = 0; - char smb_signature[20]; - - if ((cifs_pdu == NULL) || (server == NULL)) - return -EINVAL; - - if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) - return rc; - - cifs_pdu->Signature.Sequence.SequenceNumber = - cpu_to_le32(server->sequence_number); - cifs_pdu->Signature.Sequence.Reserved = 0; - - *pexpected_response_sequence_number = server->sequence_number++; - server->sequence_number++; - - rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); - if (rc) - memset(cifs_pdu->Signature.SecuritySignature, 0, 8); - else - memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8); - - return rc; -} - -static int cifs_calc_signature2(const struct kvec *iov, int n_vec, - struct TCP_Server_Info *server, char *signature) -{ - int i; - int rc; - - if (iov == NULL || signature == NULL || server == NULL) - return -EINVAL; - - if (!server->secmech.sdescmd5) { - cERROR(1, "%s: Can't generate signature\n", __func__); - return -1; - } - - rc = crypto_shash_init(&server->secmech.sdescmd5->shash); if (rc) { - cERROR(1, "%s: Oould not init md5\n", __func__); + cERROR(1, "%s: Could not update with response\n", __func__); return rc; } - crypto_shash_update(&server->secmech.sdescmd5->shash, - server->session_key.response, server->session_key.len); - for (i = 0; i < n_vec; i++) { if (iov[i].iov_len == 0) continue; @@ -131,14 +76,24 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if (i == 0) { if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ break; /* nothing to sign or corrupt header */ + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, iov[i].iov_base + 4, iov[i].iov_len - 4); - } else + } else { + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, iov[i].iov_base, iov[i].iov_len); + } + if (rc) { + cERROR(1, "%s: Could not update with payload\n", + __func__); + return rc; + } } rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -149,14 +104,20 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, { int rc = 0; char smb_signature[20]; - struct smb_hdr *cifs_pdu = iov[0].iov_base; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; if ((cifs_pdu == NULL) || (server == NULL)) return -EINVAL; - if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) + if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || + server->tcpStatus == CifsNeedNegotiate) return rc; + if (!server->session_estab) { + memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); + return rc; + } + cifs_pdu->Signature.Sequence.SequenceNumber = cpu_to_le32(server->sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; @@ -164,7 +125,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, *pexpected_response_sequence_number = server->sequence_number++; server->sequence_number++; - rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); + rc = cifs_calc_signature(iov, n_vec, server, smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -173,13 +134,27 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, return rc; } -int cifs_verify_signature(struct smb_hdr *cifs_pdu, +/* must be called with server->srv_mutex held */ +int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence_number) +{ + struct kvec iov; + + iov.iov_base = cifs_pdu; + iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4; + + return cifs_sign_smb2(&iov, 1, server, + pexpected_response_sequence_number); +} + +int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, struct TCP_Server_Info *server, __u32 expected_sequence_number) { unsigned int rc; char server_response_sig[8]; char what_we_think_sig_should_be[20]; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; if (cifs_pdu == NULL || server == NULL) return -EINVAL; @@ -211,8 +186,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, cifs_pdu->Signature.Sequence.Reserved = 0; mutex_lock(&server->srv_mutex); - rc = cifs_calculate_signature(cifs_pdu, server, - what_we_think_sig_should_be); + rc = cifs_calc_signature(iov, nr_iov, server, + what_we_think_sig_should_be); mutex_unlock(&server->srv_mutex); if (rc) @@ -229,7 +204,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, } /* first calculate 24 bytes ntlm response and then 16 byte session key */ -int setup_ntlm_response(struct cifs_ses *ses) +int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp) { int rc = 0; unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; @@ -246,14 +221,14 @@ int setup_ntlm_response(struct cifs_ses *ses) ses->auth_key.len = temp_len; rc = SMBNTencrypt(ses->password, ses->server->cryptkey, - ses->auth_key.response + CIFS_SESS_KEY_SIZE); + ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp); if (rc) { cFYI(1, "%s Can't generate NTLM response, error: %d", __func__, rc); return rc; } - rc = E_md4hash(ses->password, temp_key); + rc = E_md4hash(ses->password, temp_key, nls_cp); if (rc) { cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); return rc; @@ -315,9 +290,7 @@ static int build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) { unsigned int dlen; - unsigned int wlen; - unsigned int size = 6 * sizeof(struct ntlmssp2_name); - __le64 curtime; + unsigned int size = 2 * sizeof(struct ntlmssp2_name); char *defdmname = "WORKGROUP"; unsigned char *blobptr; struct ntlmssp2_name *attrptr; @@ -329,15 +302,14 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) } dlen = strlen(ses->domainName); - wlen = strlen(ses->server->hostname); - /* The length of this blob is a size which is - * six times the size of a structure which holds name/size + - * two times the unicode length of a domain name + - * two times the unicode length of a server name + - * size of a timestamp (which is 8 bytes). + /* + * The length of this blob is two times the size of a + * structure (av pair) which holds name/size + * ( for NTLMSSP_AV_NB_DOMAIN_NAME followed by NTLMSSP_AV_EOL ) + + * unicode length of a netbios domain name */ - ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8; + ses->auth_key.len = size + 2 * dlen; ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); if (!ses->auth_key.response) { ses->auth_key.len = 0; @@ -348,44 +320,15 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) blobptr = ses->auth_key.response; attrptr = (struct ntlmssp2_name *) blobptr; + /* + * As defined in MS-NTLM 3.3.2, just this av pair field + * is sufficient as part of the temp + */ attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME); attrptr->length = cpu_to_le16(2 * dlen); blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp); - blobptr += 2 * dlen; - attrptr = (struct ntlmssp2_name *) blobptr; - - attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME); - attrptr->length = cpu_to_le16(2 * wlen); - blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); - cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp); - - blobptr += 2 * wlen; - attrptr = (struct ntlmssp2_name *) blobptr; - - attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME); - attrptr->length = cpu_to_le16(2 * dlen); - blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); - cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp); - - blobptr += 2 * dlen; - attrptr = (struct ntlmssp2_name *) blobptr; - - attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME); - attrptr->length = cpu_to_le16(2 * wlen); - blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); - cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp); - - blobptr += 2 * wlen; - attrptr = (struct ntlmssp2_name *) blobptr; - - attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP); - attrptr->length = cpu_to_le16(sizeof(__le64)); - blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); - curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); - memcpy(blobptr, &curtime, sizeof(__le64)); - return 0; } @@ -461,10 +404,14 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, } /* calculate md4 hash of password */ - E_md4hash(ses->password, nt_hash); + E_md4hash(ses->password, nt_hash, nls_cp); - crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, CIFS_NTHASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not set NT Hash as a key", __func__); + return rc; + } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { @@ -478,13 +425,18 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, if (user == NULL) { cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); rc = -ENOMEM; - goto calc_exit_2; + return rc; } len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp); UniStrupr(user); - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)user, 2 * len); + kfree(user); + if (rc) { + cERROR(1, "%s: Could not update with user\n", __func__); + return rc; + } /* convert ses->domainName to unicode and uppercase */ if (ses->domainName) { @@ -494,13 +446,19 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, if (domain == NULL) { cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); rc = -ENOMEM; - goto calc_exit_1; + return rc; } len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, nls_cp); + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)domain, 2 * len); kfree(domain); + if (rc) { + cERROR(1, "%s: Could not update with domain\n", + __func__); + return rc; + } } else if (ses->serverName) { len = strlen(ses->serverName); @@ -508,21 +466,26 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, if (server == NULL) { cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); rc = -ENOMEM; - goto calc_exit_1; + return rc; } len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, nls_cp); + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)server, 2 * len); kfree(server); + if (rc) { + cERROR(1, "%s: Could not update with server\n", + __func__); + return rc; + } } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ntlmv2_hash); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); -calc_exit_1: - kfree(user); -calc_exit_2: return rc; } @@ -537,8 +500,12 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) return -1; } - crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__); + return rc; + } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { @@ -552,11 +519,17 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) else memcpy(ses->auth_key.response + offset, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + offset, ses->auth_key.len - offset); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + return rc; + } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + CIFS_SESS_KEY_SIZE); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -626,8 +599,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) } /* now calculate the session key for NTLMv2 */ - crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__); + goto setup_ntlmv2_rsp_ret; + } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { @@ -635,12 +612,18 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) goto setup_ntlmv2_rsp_ret; } - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + CIFS_SESS_KEY_SIZE, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + goto setup_ntlmv2_rsp_ret; + } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); setup_ntlmv2_rsp_ret: kfree(tiblob); @@ -668,8 +651,12 @@ calc_seckey(struct cifs_ses *ses) desc.tfm = tfm_arc4; - crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, + rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE); + if (rc) { + cERROR(1, "%s: Could not set response as a key", __func__); + return rc; + } sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); @@ -688,7 +675,7 @@ calc_seckey(struct cifs_ses *ses) crypto_free_blkcipher(tfm_arc4); - return 0; + return rc; } void diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 865517470967..8f1fe324162b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -53,7 +53,7 @@ int cifsFYI = 0; int cifsERROR = 1; int traceSMB = 0; -unsigned int oplockEnabled = 1; +bool enable_oplocks = true; unsigned int linuxExtEnabled = 1; unsigned int lookupCacheEnabled = 1; unsigned int multiuser_mount = 0; @@ -74,7 +74,7 @@ module_param(cifs_min_small, int, 0); MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " "Range: 2 to 256"); unsigned int cifs_max_pending = CIFS_MAX_REQ; -module_param(cifs_max_pending, int, 0); +module_param(cifs_max_pending, int, 0444); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " "Default: 50 Range: 2 to 256"); unsigned short echo_retries = 5; @@ -82,28 +82,14 @@ module_param(echo_retries, ushort, 0644); MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " "reconnecting server. Default: 5. 0 means " "never reconnect."); +module_param(enable_oplocks, bool, 0644); +MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" + "y/Y/1"); + extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; -void -cifs_sb_active(struct super_block *sb) -{ - struct cifs_sb_info *server = CIFS_SB(sb); - - if (atomic_inc_return(&server->active) == 1) - atomic_inc(&sb->s_active); -} - -void -cifs_sb_deactive(struct super_block *sb) -{ - struct cifs_sb_info *server = CIFS_SB(sb); - - if (atomic_dec_and_test(&server->active)) - deactivate_super(sb); -} - static int cifs_read_super(struct super_block *sb) { @@ -150,12 +136,12 @@ cifs_read_super(struct super_block *sb) else sb->s_d_op = &cifs_dentry_ops; -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cFYI(1, "export ops supported"); sb->s_export_op = &cifs_export_ops; } -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ return 0; @@ -450,6 +436,12 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",mfsymlinks"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) seq_printf(s, ",fsc"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) + seq_printf(s, ",nostrictsync"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + seq_printf(s, ",noperm"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) + seq_printf(s, ",strictcache"); seq_printf(s, ",rsize=%d", cifs_sb->rsize); seq_printf(s, ",wsize=%d", cifs_sb->wsize); @@ -548,7 +540,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) char *full_path = NULL; char *s, *p; char sep; - int xid; full_path = cifs_build_path_to_root(vol, cifs_sb, cifs_sb_master_tcon(cifs_sb)); @@ -557,7 +548,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) cFYI(1, "Get root dentry for %s", full_path); - xid = GetXid(); sep = CIFS_DIR_SEP(cifs_sb); dentry = dget(sb->s_root); p = s = full_path; @@ -566,6 +556,12 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) struct inode *dir = dentry->d_inode; struct dentry *child; + if (!dir) { + dput(dentry); + dentry = ERR_PTR(-ENOENT); + break; + } + /* skip separators */ while (*s == sep) s++; @@ -582,7 +578,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) dput(dentry); dentry = child; } while (!IS_ERR(dentry)); - _FreeXid(xid); kfree(full_path); return dentry; } @@ -735,7 +730,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) if (rc < 0) return (loff_t)rc; } - return generic_file_llseek_unlocked(file, offset, origin); + return generic_file_llseek(file, offset, origin); } static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) @@ -954,7 +949,8 @@ cifs_init_once(void *inode) struct cifsInodeInfo *cifsi = inode; inode_init_once(&cifsi->vfs_inode); - INIT_LIST_HEAD(&cifsi->lockList); + INIT_LIST_HEAD(&cifsi->llist); + mutex_init(&cifsi->lock_mutex); } static int diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index fbd050c8d52a..30ff56005d8f 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,10 +41,6 @@ extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; -/* Functions related to super block operations */ -extern void cifs_sb_active(struct super_block *sb); -extern void cifs_sb_deactive(struct super_block *sb); - /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); @@ -125,9 +121,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT extern const struct export_operations cifs_export_ops; -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.74" +#define CIFS_VERSION "1.76" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 6255fa812c7a..8238aa13e01c 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -167,6 +167,8 @@ struct smb_vol { uid_t cred_uid; uid_t linux_uid; gid_t linux_gid; + uid_t backupuid; + gid_t backupgid; mode_t file_mode; mode_t dir_mode; unsigned secFlg; @@ -179,6 +181,8 @@ struct smb_vol { bool noperm:1; bool no_psx_acl:1; /* set if posix acl support should be disabled */ bool cifs_acl:1; + bool backupuid_specified; /* mount option backupuid is specified */ + bool backupgid_specified; /* mount option backupgid is specified */ bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ bool server_ino:1; /* use inode numbers from server ie UniqueId */ bool direct_io:1; @@ -219,7 +223,8 @@ struct smb_vol { CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \ CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \ CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \ - CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO) + CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \ + CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID) #define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ MS_NODEV | MS_SYNCHRONOUS) @@ -286,12 +291,18 @@ struct TCP_Server_Info { bool sec_kerberosu2u; /* supports U2U Kerberos */ bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ + bool large_buf; /* is current buffer large? */ struct delayed_work echo; /* echo ping workqueue job */ + struct kvec *iov; /* reusable kvec array for receives */ + unsigned int nr_iov; /* number of kvecs in array */ + char *smallbuf; /* pointer to current "small" buffer */ + char *bigbuf; /* pointer to current "big" buffer */ + unsigned int total_read; /* total amount of data read in this pass */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif #ifdef CONFIG_CIFS_STATS2 - atomic_t inSend; /* requests trying to send */ + atomic_t in_send; /* requests trying to send */ atomic_t num_waiters; /* blocked waiting to get in sendrecv */ #endif }; @@ -485,9 +496,13 @@ extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb); */ struct cifsLockInfo { struct list_head llist; /* pointer to next cifsLockInfo */ + struct list_head blist; /* pointer to locks blocked on this */ + wait_queue_head_t block_q; __u64 offset; __u64 length; + __u32 pid; __u8 type; + __u16 netfid; }; /* @@ -501,7 +516,7 @@ struct cifs_search_info { char *ntwrk_buf_start; char *srch_entries_start; char *last_entry; - char *presume_name; + const char *presume_name; unsigned int resume_name_len; bool endOfSearch:1; bool emptyDir:1; @@ -520,8 +535,6 @@ struct cifsFileInfo { struct dentry *dentry; unsigned int f_flags; struct tcon_link *tlink; - struct mutex lock_mutex; - struct list_head llist; /* list of byte range locks we have. */ bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; int count; /* refcount protected by cifs_file_list_lock */ @@ -554,7 +567,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); */ struct cifsInodeInfo { - struct list_head lockList; + struct list_head llist; /* brlocks for this inode */ + bool can_cache_brlcks; + struct mutex lock_mutex; /* protect two fields above */ /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ @@ -643,8 +658,24 @@ static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon, struct mid_q_entry; /* - * This is the prototype for the mid callback function. When creating one, - * take special care to avoid deadlocks. Things to bear in mind: + * This is the prototype for the mid receive function. This function is for + * receiving the rest of the SMB frame, starting with the WordCount (which is + * just after the MID in struct smb_hdr). Note: + * + * - This will be called by cifsd, with no locks held. + * - The mid will still be on the pending_mid_q. + * - mid->resp_buf will point to the current buffer. + * + * Returns zero on a successful receive, or an error. The receive state in + * the TCP_Server_Info will also be updated. + */ +typedef int (mid_receive_t)(struct TCP_Server_Info *server, + struct mid_q_entry *mid); + +/* + * This is the prototype for the mid callback function. This is called once the + * mid has been received off of the socket. When creating one, take special + * care to avoid deadlocks. Things to bear in mind: * * - it will be called by cifsd, with no locks held * - the mid will be removed from any lists @@ -662,9 +693,10 @@ struct mid_q_entry { unsigned long when_sent; /* time when smb send finished */ unsigned long when_received; /* when demux complete (taken off wire) */ #endif + mid_receive_t *receive; /* call receive callback */ mid_callback_t *callback; /* call completion callback */ void *callback_data; /* general purpose pointer for callback */ - struct smb_hdr *resp_buf; /* response buffer */ + struct smb_hdr *resp_buf; /* pointer to received SMB header */ int midState; /* wish this were enum but can not pass to wait_event */ __u8 command; /* smb command code */ bool largeBuf:1; /* if valid response, is pointer to large buf */ @@ -672,12 +704,54 @@ struct mid_q_entry { bool multiEnd:1; /* both received */ }; -struct oplock_q_entry { - struct list_head qhead; - struct inode *pinode; - struct cifs_tcon *tcon; - __u16 netfid; -}; +/* Make code in transport.c a little cleaner by moving + update of optional stats into function below */ +#ifdef CONFIG_CIFS_STATS2 + +static inline void cifs_in_send_inc(struct TCP_Server_Info *server) +{ + atomic_inc(&server->in_send); +} + +static inline void cifs_in_send_dec(struct TCP_Server_Info *server) +{ + atomic_dec(&server->in_send); +} + +static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server) +{ + atomic_inc(&server->num_waiters); +} + +static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server) +{ + atomic_dec(&server->num_waiters); +} + +static inline void cifs_save_when_sent(struct mid_q_entry *mid) +{ + mid->when_sent = jiffies; +} +#else +static inline void cifs_in_send_inc(struct TCP_Server_Info *server) +{ +} +static inline void cifs_in_send_dec(struct TCP_Server_Info *server) +{ +} + +static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server) +{ +} + +static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server) +{ +} + +static inline void cifs_save_when_sent(struct mid_q_entry *mid) +{ +} +#endif /* for pending dnotify requests */ struct dir_notify_req { @@ -922,7 +996,8 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions to be established on existing mount if we have the uid/password or Kerberos credential or equivalent for current user */ -GLOBAL_EXTERN unsigned int oplockEnabled; +/* enable or disable oplocks */ +GLOBAL_EXTERN bool enable_oplocks; GLOBAL_EXTERN unsigned int lookupCacheEnabled; GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ @@ -936,14 +1011,18 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ /* reconnect after this many failed echo attempts */ GLOBAL_EXTERN unsigned short echo_retries; +#ifdef CONFIG_CIFS_ACL GLOBAL_EXTERN struct rb_root uidtree; GLOBAL_EXTERN struct rb_root gidtree; GLOBAL_EXTERN spinlock_t siduidlock; GLOBAL_EXTERN spinlock_t sidgidlock; +GLOBAL_EXTERN struct rb_root siduidtree; +GLOBAL_EXTERN struct rb_root sidgidtree; +GLOBAL_EXTERN spinlock_t uidsidlock; +GLOBAL_EXTERN spinlock_t gidsidlock; +#endif /* CONFIG_CIFS_ACL */ void cifs_oplock_break(struct work_struct *work); -void cifs_oplock_break_get(struct cifsFileInfo *cfile); -void cifs_oplock_break_put(struct cifsFileInfo *cfile); extern const struct slow_work_ops cifs_oplock_break_ops; diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index de3aa285de03..3fb03e2c8e86 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1089,9 +1089,7 @@ typedef struct smb_com_read_rsp { __le16 DataLengthHigh; __u64 Reserved2; __u16 ByteCount; - __u8 Pad; /* BB check for whether padded to DWORD - boundary and optimum performance here */ - char Data[1]; + /* read response data immediately follows */ } __attribute__((packed)) READ_RSP; typedef struct locking_andx_range { @@ -1913,6 +1911,10 @@ typedef struct whoami_rsp_data { /* Query level 0x202 */ /* SETFSInfo Levels */ #define SMB_SET_CIFS_UNIX_INFO 0x200 +/* level 0x203 is defined above in list of QFS info levels */ +/* #define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 */ + +/* Level 0x200 request structure follows */ typedef struct smb_com_transaction2_setfsi_req { struct smb_hdr hdr; /* wct = 15 */ __le16 TotalParameterCount; @@ -1940,13 +1942,39 @@ typedef struct smb_com_transaction2_setfsi_req { __le64 ClientUnixCap; /* Data end */ } __attribute__((packed)) TRANSACTION2_SETFSI_REQ; +/* level 0x203 request structure follows */ +typedef struct smb_com_transaction2_setfs_enc_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; /* 4 */ + __le16 ParameterOffset; + __le16 DataCount; /* 12 */ + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */ + __le16 ByteCount; + __u8 Pad; + __u16 Reserved4; /* Parameters start. */ + __le16 InformationLevel;/* Parameters end. */ + /* NTLMSSP Blob, Data start. */ +} __attribute__((packed)) TRANSACTION2_SETFSI_ENC_REQ; + +/* response for setfsinfo levels 0x200 and 0x203 */ typedef struct smb_com_transaction2_setfsi_rsp { struct smb_hdr hdr; /* wct = 10 */ struct trans2_resp t2; __u16 ByteCount; } __attribute__((packed)) TRANSACTION2_SETFSI_RSP; - typedef struct smb_com_transaction2_get_dfs_refer_req { struct smb_hdr hdr; /* wct = 15 */ __le16 TotalParameterCount; @@ -2098,13 +2126,13 @@ typedef struct { #define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and QFS PROXY call */ #ifdef CONFIG_CIFS_POSIX -/* Can not set pathnames cap yet until we send new posix create SMB since - otherwise server can treat such handles opened with older ntcreatex - (by a new client which knows how to send posix path ops) - as non-posix handles (can affect write behavior with byte range locks. - We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */ +/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send + LockingX instead of posix locking call on unix sess (and we do not expect + LockingX to use different (ie Windows) semantics than posix locking on + the same session (if WINE needs to do this later, we can add this cap + back in later */ /* #define CIFS_UNIX_CAP_MASK 0x000000fb */ -#define CIFS_UNIX_CAP_MASK 0x000000db +#define CIFS_UNIX_CAP_MASK 0x000003db #else #define CIFS_UNIX_CAP_MASK 0x00000013 #endif /* CONFIG_CIFS_POSIX */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 8df28e925e5b..6f4e243e0f62 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -69,8 +69,9 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server); extern void DeleteMidQEntry(struct mid_q_entry *midEntry); extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, mid_callback_t *callback, - void *cbdata, bool ignore_pend); + unsigned int nvec, mid_receive_t *receive, + mid_callback_t *callback, void *cbdata, + bool ignore_pend); extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, struct smb_hdr * /* input */ , struct smb_hdr * /* out */ , @@ -90,6 +91,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); extern bool is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); +extern bool backup_cred(struct cifs_sb_info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); @@ -145,12 +147,19 @@ extern int cifs_get_inode_info_unix(struct inode **pinode, extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid); -extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); +extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64, + uid_t, gid_t); extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, - const char *); + const char *, int); +extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); +extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read); +extern int cifs_readv_from_socket(struct TCP_Server_Info *server, + struct kvec *iov_orig, unsigned int nr_segs, + unsigned int to_read); extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); @@ -359,14 +368,17 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon, const struct nls_table *nls_codepage, int remap_special_chars); +extern int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid, + const __u8 lock_type, const __u32 num_unlock, + const __u32 num_lock, LOCKING_ANDX_RANGE *buf); extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, - const __u16 netfid, const __u64 len, + const __u16 netfid, const __u32 netpid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, const bool waitFlag, const __u8 oplock_level); extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const int get_flag, - const __u64 len, struct file_lock *, + const __u16 smb_file_id, const __u32 netpid, + const int get_flag, const __u64 len, struct file_lock *, const __u16 lock_type, const bool waitFlag); extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon); extern int CIFSSMBEcho(struct TCP_Server_Info *server); @@ -380,11 +392,12 @@ extern void tconInfoFree(struct cifs_tcon *); extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, __u32 *); -extern int cifs_verify_signature(struct smb_hdr *, +extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, struct TCP_Server_Info *server, __u32 expected_sequence_number); -extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); -extern int setup_ntlm_response(struct cifs_ses *); +extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *, + const struct nls_table *); +extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); @@ -419,7 +432,7 @@ extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon, extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16, - struct cifs_ntsd *, __u32); + struct cifs_ntsd *, __u32, int); extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char *acl_inf, const int buflen, const int acl_type, @@ -436,10 +449,29 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, const unsigned char *path, struct cifs_sb_info *cifs_sb, int xid); extern int mdfour(unsigned char *, unsigned char *, int); -extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); +extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, + const struct nls_table *codepage); extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); +/* asynchronous read support */ +struct cifs_readdata { + struct cifsFileInfo *cfile; + struct address_space *mapping; + __u64 offset; + unsigned int bytes; + pid_t pid; + int result; + struct list_head pages; + struct work_struct work; + unsigned int nr_iov; + struct kvec iov[1]; +}; + +struct cifs_readdata *cifs_readdata_alloc(unsigned int nr_pages); +void cifs_readdata_free(struct cifs_readdata *rdata); +int cifs_async_readv(struct cifs_readdata *rdata); + /* asynchronous write support */ struct cifs_writedata { struct kref refcount; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1a9fe7f816d1..6600aa2d2ef3 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -33,6 +33,8 @@ #include <linux/slab.h> #include <linux/posix_acl_xattr.h> #include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/task_io_accounting_ops.h> #include <asm/uaccess.h> #include "cifspdu.h" #include "cifsglob.h" @@ -40,6 +42,7 @@ #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" +#include "fscache.h" #ifdef CONFIG_CIFS_POSIX static struct { @@ -83,6 +86,9 @@ static struct { #endif /* CONFIG_CIFS_WEAK_PW_HASH */ #endif /* CIFS_POSIX */ +/* Forward declarations */ +static void cifs_readv_complete(struct work_struct *work); + /* Mark as invalid, all open files on tree connections since they were closed when session to server was lost */ static void mark_open_files_invalid(struct cifs_tcon *pTcon) @@ -107,7 +113,7 @@ static void mark_open_files_invalid(struct cifs_tcon *pTcon) static int cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) { - int rc = 0; + int rc; struct cifs_ses *ses; struct TCP_Server_Info *server; struct nls_table *nls_codepage; @@ -453,8 +459,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) } server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode); server->maxReq = le16_to_cpu(rsp->MaxMpxCount); - server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), - (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->maxBuf = le16_to_cpu(rsp->MaxBufSize); server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); /* even though we do not use raw we might as well set this accurately, in case we ever find a need for it */ @@ -561,8 +566,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) little endian */ server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); /* probably no need to store and check maxvcs */ - server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), - (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); server->capabilities = le32_to_cpu(pSMBr->Capabilities); @@ -739,7 +743,8 @@ CIFSSMBEcho(struct TCP_Server_Info *server) iov.iov_base = smb; iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; - rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true); + rc = cifs_call_async(server, &iov, 1, NULL, cifs_echo_callback, + server, true); if (rc) cFYI(1, "Echo request failed: %d", rc); @@ -1376,6 +1381,359 @@ openRetry: return rc; } +struct cifs_readdata * +cifs_readdata_alloc(unsigned int nr_pages) +{ + struct cifs_readdata *rdata; + + /* readdata + 1 kvec for each page */ + rdata = kzalloc(sizeof(*rdata) + + sizeof(struct kvec) * nr_pages, GFP_KERNEL); + if (rdata != NULL) { + INIT_WORK(&rdata->work, cifs_readv_complete); + INIT_LIST_HEAD(&rdata->pages); + } + return rdata; +} + +void +cifs_readdata_free(struct cifs_readdata *rdata) +{ + cifsFileInfo_put(rdata->cfile); + kfree(rdata); +} + +/* + * Discard any remaining data in the current SMB. To do this, we borrow the + * current bigbuf. + */ +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + READ_RSP *rsp = (READ_RSP *)server->smallbuf; + unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length); + int remaining = rfclen + 4 - server->total_read; + struct cifs_readdata *rdata = mid->callback_data; + + while (remaining > 0) { + int length; + + length = cifs_read_from_socket(server, server->bigbuf, + min_t(unsigned int, remaining, + CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)); + if (length < 0) + return length; + server->total_read += length; + remaining -= length; + } + + dequeue_mid(mid, rdata->result); + return 0; +} + +static int +cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length, len; + unsigned int data_offset, remaining, data_len; + struct cifs_readdata *rdata = mid->callback_data; + READ_RSP *rsp = (READ_RSP *)server->smallbuf; + unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length) + 4; + u64 eof; + pgoff_t eof_index; + struct page *page, *tpage; + + cFYI(1, "%s: mid=%u offset=%llu bytes=%u", __func__, + mid->mid, rdata->offset, rdata->bytes); + + /* + * read the rest of READ_RSP header (sans Data array), or whatever we + * can if there's not enough data. At this point, we've read down to + * the Mid. + */ + len = min_t(unsigned int, rfclen, sizeof(*rsp)) - + sizeof(struct smb_hdr) + 1; + + rdata->iov[0].iov_base = server->smallbuf + sizeof(struct smb_hdr) - 1; + rdata->iov[0].iov_len = len; + + length = cifs_readv_from_socket(server, rdata->iov, 1, len); + if (length < 0) + return length; + server->total_read += length; + + /* Was the SMB read successful? */ + rdata->result = map_smb_to_linux_error(&rsp->hdr, false); + if (rdata->result != 0) { + cFYI(1, "%s: server returned error %d", __func__, + rdata->result); + return cifs_readv_discard(server, mid); + } + + /* Is there enough to get to the rest of the READ_RSP header? */ + if (server->total_read < sizeof(READ_RSP)) { + cFYI(1, "%s: server returned short header. got=%u expected=%zu", + __func__, server->total_read, sizeof(READ_RSP)); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + data_offset = le16_to_cpu(rsp->DataOffset) + 4; + if (data_offset < server->total_read) { + /* + * win2k8 sometimes sends an offset of 0 when the read + * is beyond the EOF. Treat it as if the data starts just after + * the header. + */ + cFYI(1, "%s: data offset (%u) inside read response header", + __func__, data_offset); + data_offset = server->total_read; + } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) { + /* data_offset is beyond the end of smallbuf */ + cFYI(1, "%s: data offset (%u) beyond end of smallbuf", + __func__, data_offset); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + cFYI(1, "%s: total_read=%u data_offset=%u", __func__, + server->total_read, data_offset); + + len = data_offset - server->total_read; + if (len > 0) { + /* read any junk before data into the rest of smallbuf */ + rdata->iov[0].iov_base = server->smallbuf + server->total_read; + rdata->iov[0].iov_len = len; + length = cifs_readv_from_socket(server, rdata->iov, 1, len); + if (length < 0) + return length; + server->total_read += length; + } + + /* set up first iov for signature check */ + rdata->iov[0].iov_base = server->smallbuf; + rdata->iov[0].iov_len = server->total_read; + cFYI(1, "0: iov_base=%p iov_len=%zu", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + + /* how much data is in the response? */ + data_len = le16_to_cpu(rsp->DataLengthHigh) << 16; + data_len += le16_to_cpu(rsp->DataLength); + if (data_offset + data_len > rfclen) { + /* data_len is corrupt -- discard frame */ + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + /* marshal up the page array */ + len = 0; + remaining = data_len; + rdata->nr_iov = 1; + + /* determine the eof that the server (probably) has */ + eof = CIFS_I(rdata->mapping->host)->server_eof; + eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; + cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index); + + list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { + if (remaining >= PAGE_CACHE_SIZE) { + /* enough data to fill the page */ + rdata->iov[rdata->nr_iov].iov_base = kmap(page); + rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE; + cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", + rdata->nr_iov, page->index, + rdata->iov[rdata->nr_iov].iov_base, + rdata->iov[rdata->nr_iov].iov_len); + ++rdata->nr_iov; + len += PAGE_CACHE_SIZE; + remaining -= PAGE_CACHE_SIZE; + } else if (remaining > 0) { + /* enough for partial page, fill and zero the rest */ + rdata->iov[rdata->nr_iov].iov_base = kmap(page); + rdata->iov[rdata->nr_iov].iov_len = remaining; + cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", + rdata->nr_iov, page->index, + rdata->iov[rdata->nr_iov].iov_base, + rdata->iov[rdata->nr_iov].iov_len); + memset(rdata->iov[rdata->nr_iov].iov_base + remaining, + '\0', PAGE_CACHE_SIZE - remaining); + ++rdata->nr_iov; + len += remaining; + remaining = 0; + } else if (page->index > eof_index) { + /* + * The VFS will not try to do readahead past the + * i_size, but it's possible that we have outstanding + * writes with gaps in the middle and the i_size hasn't + * caught up yet. Populate those with zeroed out pages + * to prevent the VFS from repeatedly attempting to + * fill them until the writes are flushed. + */ + zero_user(page, 0, PAGE_CACHE_SIZE); + list_del(&page->lru); + lru_cache_add_file(page); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } else { + /* no need to hold page hostage */ + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); + } + } + + /* issue the read if we have any iovecs left to fill */ + if (rdata->nr_iov > 1) { + length = cifs_readv_from_socket(server, &rdata->iov[1], + rdata->nr_iov - 1, len); + if (length < 0) + return length; + server->total_read += length; + } else { + length = 0; + } + + rdata->bytes = length; + + cFYI(1, "total_read=%u rfclen=%u remaining=%u", server->total_read, + rfclen, remaining); + + /* discard anything left over */ + if (server->total_read < rfclen) + return cifs_readv_discard(server, mid); + + dequeue_mid(mid, false); + return length; +} + +static void +cifs_readv_complete(struct work_struct *work) +{ + struct cifs_readdata *rdata = container_of(work, + struct cifs_readdata, work); + struct page *page, *tpage; + + list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { + list_del(&page->lru); + lru_cache_add_file(page); + + if (rdata->result == 0) { + kunmap(page); + flush_dcache_page(page); + SetPageUptodate(page); + } + + unlock_page(page); + + if (rdata->result == 0) + cifs_readpage_to_fscache(rdata->mapping->host, page); + + page_cache_release(page); + } + cifs_readdata_free(rdata); +} + +static void +cifs_readv_callback(struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + + cFYI(1, "%s: mid=%u state=%d result=%d bytes=%u", __func__, + mid->mid, mid->midState, rdata->result, rdata->bytes); + + switch (mid->midState) { + case MID_RESPONSE_RECEIVED: + /* result already set, check signature */ + if (server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (cifs_verify_signature(rdata->iov, rdata->nr_iov, + server, mid->sequence_number + 1)) + cERROR(1, "Unexpected SMB signature"); + } + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->bytes); + cifs_stats_bytes_read(tcon, rdata->bytes); + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: + rdata->result = -EAGAIN; + break; + default: + rdata->result = -EIO; + } + + queue_work(system_nrt_wq, &rdata->work); + DeleteMidQEntry(mid); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); +} + +/* cifs_async_readv - send an async write, and set up mid to handle result */ +int +cifs_async_readv(struct cifs_readdata *rdata) +{ + int rc; + READ_REQ *smb = NULL; + int wct; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + + cFYI(1, "%s: offset=%llu bytes=%u", __func__, + rdata->offset, rdata->bytes); + + if (tcon->ses->capabilities & CAP_LARGE_FILES) + wct = 12; + else { + wct = 10; /* old style read */ + if ((rdata->offset >> 32) > 0) { + /* can not handle this big offset for old */ + return -EIO; + } + } + + rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&smb); + if (rc) + return rc; + + smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16)); + + smb->AndXCommand = 0xFF; /* none */ + smb->Fid = rdata->cfile->netfid; + smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF); + if (wct == 12) + smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32); + smb->Remaining = 0; + smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF); + smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16); + if (wct == 12) + smb->ByteCount = 0; + else { + /* old style read */ + struct smb_com_readx_req *smbr = + (struct smb_com_readx_req *)smb; + smbr->ByteCount = 0; + } + + /* 4 for RFC1001 length + 1 for BCC */ + rdata->iov[0].iov_base = smb; + rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; + + rc = cifs_call_async(tcon->ses->server, rdata->iov, 1, + cifs_readv_receive, cifs_readv_callback, + rdata, false); + + if (rc == 0) + cifs_stats_inc(&tcon->num_reads); + + cifs_small_buf_release(smb); + return rc; +} + int CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *pbuf_type) @@ -1836,7 +2194,7 @@ cifs_async_writev(struct cifs_writedata *wdata) kref_get(&wdata->refcount); rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1, - cifs_writev_callback, wdata, false); + NULL, cifs_writev_callback, wdata, false); if (rc == 0) cifs_stats_inc(&tcon->num_writes); @@ -1962,10 +2320,50 @@ CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms, return rc; } +int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid, + const __u8 lock_type, const __u32 num_unlock, + const __u32 num_lock, LOCKING_ANDX_RANGE *buf) +{ + int rc = 0; + LOCK_REQ *pSMB = NULL; + struct kvec iov[2]; + int resp_buf_type; + __u16 count; + + cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock); + + rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->Timeout = 0; + pSMB->NumberOfLocks = cpu_to_le16(num_lock); + pSMB->NumberOfUnlocks = cpu_to_le16(num_unlock); + pSMB->LockType = lock_type; + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; /* netfid stays le */ + + count = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4 - + (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + iov[1].iov_base = (char *)buf; + iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + + cifs_stats_inc(&tcon->num_locks); + rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); + if (rc) + cFYI(1, "Send error in cifs_lockv = %d", rc); + + return rc; +} int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const __u64 len, + const __u16 smb_file_id, const __u32 netpid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, const bool waitFlag, const __u8 oplock_level) @@ -2001,7 +2399,7 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon, pSMB->Fid = smb_file_id; /* netfid stays le */ if ((numLock != 0) || (numUnlock != 0)) { - pSMB->Locks[0].Pid = cpu_to_le16(current->tgid); + pSMB->Locks[0].Pid = cpu_to_le16(netpid); /* BB where to store pid high? */ pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len); pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32)); @@ -2035,9 +2433,9 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon, int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const int get_flag, const __u64 len, - struct file_lock *pLockData, const __u16 lock_type, - const bool waitFlag) + const __u16 smb_file_id, const __u32 netpid, const int get_flag, + const __u64 len, struct file_lock *pLockData, + const __u16 lock_type, const bool waitFlag) { struct smb_com_transaction2_sfi_req *pSMB = NULL; struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; @@ -2095,7 +2493,7 @@ CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, } else pSMB->Timeout = 0; - parm_data->pid = cpu_to_le32(current->tgid); + parm_data->pid = cpu_to_le32(netpid); parm_data->start = cpu_to_le64(pLockData->fl_start); parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ @@ -2812,8 +3210,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon, pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le32(2); /* BB find exact data count max from sess structure BB */ - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; @@ -3306,8 +3703,7 @@ smb_init_nttransact(const __u16 sub_command, const int setup_count, pSMB->Reserved = 0; pSMB->TotalParameterCount = cpu_to_le32(parm_len); pSMB->TotalDataCount = 0; - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->ParameterCount = pSMB->TotalParameterCount; pSMB->DataCount = pSMB->TotalDataCount; temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + @@ -3467,7 +3863,7 @@ qsec_out: int CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, - struct cifs_ntsd *pntsd, __u32 acllen) + struct cifs_ntsd *pntsd, __u32 acllen, int aclflag) { __u16 byte_count, param_count, data_count, param_offset, data_offset; int rc = 0; @@ -3504,7 +3900,7 @@ setCifsAclRetry: pSMB->Fid = fid; /* file handle always le */ pSMB->Reserved2 = 0; - pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL); + pSMB->AclFlags = cpu_to_le32(aclflag); if (pntsd && acllen) { memcpy((char *) &pSMBr->hdr.Protocol + data_offset, @@ -3977,8 +4373,7 @@ findFirstRetry: params = 12 + name_len /* includes null */ ; pSMB->TotalDataCount = 0; /* no EAs */ pSMB->MaxParameterCount = cpu_to_le16(10); - pSMB->MaxDataCount = cpu_to_le16((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4052,8 +4447,7 @@ findFirstRetry: psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); - if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < - lnoff) { + if (CIFSMaxBufSize < lnoff) { cERROR(1, "ignoring corrupt resume name"); psrch_inf->last_entry = NULL; return rc; @@ -4079,7 +4473,8 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, T2_FNEXT_RSP_PARMS *parms; char *response_data; int rc = 0; - int bytes_returned, name_len; + int bytes_returned; + unsigned int name_len; __u16 params, byte_count; cFYI(1, "In FindNext"); @@ -4096,9 +4491,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, byte_count = 0; pSMB->TotalDataCount = 0; /* no EAs */ pSMB->MaxParameterCount = cpu_to_le16(8); - pSMB->MaxDataCount = - cpu_to_le16((tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & - 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4180,8 +4573,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, psrch_inf->index_of_last_entry += psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); - if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < - lnoff) { + if (CIFSMaxBufSize < lnoff) { cERROR(1, "ignoring corrupt resume name"); psrch_inf->last_entry = NULL; return rc; @@ -5720,6 +6112,7 @@ CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon, char *temp_ptr; char *end_of_smb; __u16 params, byte_count, data_offset; + unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0; cFYI(1, "In Query All EAs path %s", searchName); QAllEAsRetry: @@ -5837,7 +6230,8 @@ QAllEAsRetry: } if (ea_name) { - if (strncmp(ea_name, temp_ptr, name_len) == 0) { + if (ea_name_len == name_len && + memcmp(ea_name, temp_ptr, name_len) == 0) { temp_ptr += name_len + 1; rc = value_len; if (buf_size == 0) @@ -6032,12 +6426,7 @@ int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon, pSMB->TotalParameterCount = 0 ; pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le32(2); - /* BB find exact data count max from sess structure BB */ - pSMB->MaxDataCount = 0; /* same in little endian or be */ -/* BB VERIFY verify which is correct for above BB */ - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); - + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e66297bad412..d6a972df0338 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -37,6 +37,7 @@ #include <asm/uaccess.h> #include <asm/processor.h> #include <linux/inet.h> +#include <linux/module.h> #include <net/ipv6.h> #include "cifspdu.h" #include "cifsglob.h" @@ -181,7 +182,7 @@ cifs_reconnect(struct TCP_Server_Info *server) -EINVAL = invalid transact2 */ -static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) +static int check2ndT2(struct smb_hdr *pSMB) { struct smb_t2_rsp *pSMBt; int remaining; @@ -214,9 +215,9 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) cFYI(1, "missing %d bytes from transact2, check next response", remaining); - if (total_data_size > maxBufSize) { + if (total_data_size > CIFSMaxBufSize) { cERROR(1, "TotalDataSize %d is over maximum buffer %d", - total_data_size, maxBufSize); + total_data_size, CIFSMaxBufSize); return -EINVAL; } return remaining; @@ -319,344 +320,298 @@ requeue_echo: queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL); } -static int -cifs_demultiplex_thread(void *p) +static bool +allocate_buffers(struct TCP_Server_Info *server) { - int length; - struct TCP_Server_Info *server = p; - unsigned int pdu_length, total_read; - struct smb_hdr *smb_buffer = NULL; - struct smb_hdr *bigbuf = NULL; - struct smb_hdr *smallbuf = NULL; - struct msghdr smb_msg; - struct kvec iov; - struct socket *csocket = server->ssocket; - struct list_head *tmp, *tmp2; - struct task_struct *task_to_wake = NULL; - struct mid_q_entry *mid_entry; - char temp; - bool isLargeBuf = false; - bool isMultiRsp; - int reconnect; + if (!server->bigbuf) { + server->bigbuf = (char *)cifs_buf_get(); + if (!server->bigbuf) { + cERROR(1, "No memory for large SMB response"); + msleep(3000); + /* retry will check if exiting */ + return false; + } + } else if (server->large_buf) { + /* we are reusing a dirty large buf, clear its start */ + memset(server->bigbuf, 0, sizeof(struct smb_hdr)); + } - current->flags |= PF_MEMALLOC; - cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); + if (!server->smallbuf) { + server->smallbuf = (char *)cifs_small_buf_get(); + if (!server->smallbuf) { + cERROR(1, "No memory for SMB response"); + msleep(1000); + /* retry will check if exiting */ + return false; + } + /* beginning of smb buffer is cleared in our buf_get */ + } else { + /* if existing small buf clear beginning */ + memset(server->smallbuf, 0, sizeof(struct smb_hdr)); + } - length = atomic_inc_return(&tcpSesAllocCount); - if (length > 1) - mempool_resize(cifs_req_poolp, length + cifs_min_rcv, - GFP_KERNEL); + return true; +} - set_freezable(); - while (server->tcpStatus != CifsExiting) { - if (try_to_freeze()) - continue; - if (bigbuf == NULL) { - bigbuf = cifs_buf_get(); - if (!bigbuf) { - cERROR(1, "No memory for large SMB response"); - msleep(3000); - /* retry will check if exiting */ - continue; - } - } else if (isLargeBuf) { - /* we are reusing a dirty large buf, clear its start */ - memset(bigbuf, 0, sizeof(struct smb_hdr)); +static bool +server_unresponsive(struct TCP_Server_Info *server) +{ + if (echo_retries > 0 && server->tcpStatus == CifsGood && + time_after(jiffies, server->lstrp + + (echo_retries * SMB_ECHO_INTERVAL))) { + cERROR(1, "Server %s has not responded in %d seconds. " + "Reconnecting...", server->hostname, + (echo_retries * SMB_ECHO_INTERVAL / HZ)); + cifs_reconnect(server); + wake_up(&server->response_q); + return true; + } + + return false; +} + +/* + * kvec_array_init - clone a kvec array, and advance into it + * @new: pointer to memory for cloned array + * @iov: pointer to original array + * @nr_segs: number of members in original array + * @bytes: number of bytes to advance into the cloned array + * + * This function will copy the array provided in iov to a section of memory + * and advance the specified number of bytes into the new array. It returns + * the number of segments in the new array. "new" must be at least as big as + * the original iov array. + */ +static unsigned int +kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs, + size_t bytes) +{ + size_t base = 0; + + while (bytes || !iov->iov_len) { + int copy = min(bytes, iov->iov_len); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + nr_segs--; + base = 0; } + } + memcpy(new, iov, sizeof(*iov) * nr_segs); + new->iov_base += base; + new->iov_len -= base; + return nr_segs; +} - if (smallbuf == NULL) { - smallbuf = cifs_small_buf_get(); - if (!smallbuf) { - cERROR(1, "No memory for SMB response"); - msleep(1000); - /* retry will check if exiting */ - continue; - } - /* beginning of smb buffer is cleared in our buf_get */ - } else /* if existing small buf clear beginning */ - memset(smallbuf, 0, sizeof(struct smb_hdr)); - - isLargeBuf = false; - isMultiRsp = false; - smb_buffer = smallbuf; - iov.iov_base = smb_buffer; - iov.iov_len = 4; - smb_msg.msg_control = NULL; - smb_msg.msg_controllen = 0; - pdu_length = 4; /* enough to get RFC1001 header */ +static struct kvec * +get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs) +{ + struct kvec *new_iov; -incomplete_rcv: - if (echo_retries > 0 && server->tcpStatus == CifsGood && - time_after(jiffies, server->lstrp + - (echo_retries * SMB_ECHO_INTERVAL))) { - cERROR(1, "Server %s has not responded in %d seconds. " - "Reconnecting...", server->hostname, - (echo_retries * SMB_ECHO_INTERVAL / HZ)); - cifs_reconnect(server); - csocket = server->ssocket; - wake_up(&server->response_q); - continue; + if (server->iov && nr_segs <= server->nr_iov) + return server->iov; + + /* not big enough -- allocate a new one and release the old */ + new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS); + if (new_iov) { + kfree(server->iov); + server->iov = new_iov; + server->nr_iov = nr_segs; + } + return new_iov; +} + +int +cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, + unsigned int nr_segs, unsigned int to_read) +{ + int length = 0; + int total_read; + unsigned int segs; + struct msghdr smb_msg; + struct kvec *iov; + + iov = get_server_iovec(server, nr_segs); + if (!iov) + return -ENOMEM; + + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; + + for (total_read = 0; to_read; total_read += length, to_read -= length) { + if (server_unresponsive(server)) { + total_read = -EAGAIN; + break; } - length = - kernel_recvmsg(csocket, &smb_msg, - &iov, 1, pdu_length, 0 /* BB other flags? */); + segs = kvec_array_init(iov, iov_orig, nr_segs, total_read); + + length = kernel_recvmsg(server->ssocket, &smb_msg, + iov, segs, to_read, 0); if (server->tcpStatus == CifsExiting) { + total_read = -ESHUTDOWN; break; } else if (server->tcpStatus == CifsNeedReconnect) { - cFYI(1, "Reconnect after server stopped responding"); cifs_reconnect(server); - cFYI(1, "call to reconnect done"); - csocket = server->ssocket; - continue; + total_read = -EAGAIN; + break; } else if (length == -ERESTARTSYS || length == -EAGAIN || length == -EINTR) { - msleep(1); /* minimum sleep to prevent looping - allowing socket to clear and app threads to set - tcpStatus CifsNeedReconnect if server hung */ - if (pdu_length < 4) { - iov.iov_base = (4 - pdu_length) + - (char *)smb_buffer; - iov.iov_len = pdu_length; - smb_msg.msg_control = NULL; - smb_msg.msg_controllen = 0; - goto incomplete_rcv; - } else - continue; - } else if (length <= 0) { - cFYI(1, "Reconnect after unexpected peek error %d", - length); - cifs_reconnect(server); - csocket = server->ssocket; - wake_up(&server->response_q); - continue; - } else if (length < pdu_length) { - cFYI(1, "requested %d bytes but only got %d bytes", - pdu_length, length); - pdu_length -= length; - msleep(1); - goto incomplete_rcv; - } - - /* The right amount was read from socket - 4 bytes */ - /* so we can now interpret the length field */ - - /* the first byte big endian of the length field, - is actually not part of the length but the type - with the most common, zero, as regular data */ - temp = *((char *) smb_buffer); - - /* Note that FC 1001 length is big endian on the wire, - but we convert it here so it is always manipulated - as host byte order */ - pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); - - cFYI(1, "rfc1002 length 0x%x", pdu_length+4); - - if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { - continue; - } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { - cFYI(1, "Good RFC 1002 session rsp"); - continue; - } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { - /* we get this from Windows 98 instead of - an error on SMB negprot response */ - cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", - pdu_length); - /* give server a second to clean up */ - msleep(1000); - /* always try 445 first on reconnect since we get NACK - * on some if we ever connected to port 139 (the NACK - * is since we do not begin with RFC1001 session - * initialize frame) + /* + * Minimum sleep to prevent looping, allowing socket + * to clear and app threads to set tcpStatus + * CifsNeedReconnect if server hung. */ - cifs_set_port((struct sockaddr *) - &server->dstaddr, CIFS_PORT); - cifs_reconnect(server); - csocket = server->ssocket; - wake_up(&server->response_q); - continue; - } else if (temp != (char) 0) { - cERROR(1, "Unknown RFC 1002 frame"); - cifs_dump_mem(" Received Data: ", (char *)smb_buffer, - length); - cifs_reconnect(server); - csocket = server->ssocket; + usleep_range(1000, 2000); + length = 0; continue; - } - - /* else we have an SMB response */ - if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || - (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { - cERROR(1, "Invalid size SMB length %d pdu_length %d", - length, pdu_length+4); + } else if (length <= 0) { + cFYI(1, "Received no data or error: expecting %d " + "got %d", to_read, length); cifs_reconnect(server); - csocket = server->ssocket; - wake_up(&server->response_q); - continue; + total_read = -EAGAIN; + break; } + } + return total_read; +} - /* else length ok */ - reconnect = 0; - - if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { - isLargeBuf = true; - memcpy(bigbuf, smallbuf, 4); - smb_buffer = bigbuf; - } - length = 0; - iov.iov_base = 4 + (char *)smb_buffer; - iov.iov_len = pdu_length; - for (total_read = 0; total_read < pdu_length; - total_read += length) { - length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, - pdu_length - total_read, 0); - if (server->tcpStatus == CifsExiting) { - /* then will exit */ - reconnect = 2; - break; - } else if (server->tcpStatus == CifsNeedReconnect) { - cifs_reconnect(server); - csocket = server->ssocket; - /* Reconnect wakes up rspns q */ - /* Now we will reread sock */ - reconnect = 1; - break; - } else if (length == -ERESTARTSYS || - length == -EAGAIN || - length == -EINTR) { - msleep(1); /* minimum sleep to prevent looping, - allowing socket to clear and app - threads to set tcpStatus - CifsNeedReconnect if server hung*/ - length = 0; - continue; - } else if (length <= 0) { - cERROR(1, "Received no data, expecting %d", - pdu_length - total_read); - cifs_reconnect(server); - csocket = server->ssocket; - reconnect = 1; - break; - } - } - if (reconnect == 2) - break; - else if (reconnect == 1) - continue; +int +cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read) +{ + struct kvec iov; - total_read += 4; /* account for rfc1002 hdr */ + iov.iov_base = buf; + iov.iov_len = to_read; - dump_smb(smb_buffer, total_read); + return cifs_readv_from_socket(server, &iov, 1, to_read); +} +static bool +is_smb_response(struct TCP_Server_Info *server, unsigned char type) +{ + /* + * The first byte big endian of the length field, + * is actually not part of the length but the type + * with the most common, zero, as regular data. + */ + switch (type) { + case RFC1002_SESSION_MESSAGE: + /* Regular SMB response */ + return true; + case RFC1002_SESSION_KEEP_ALIVE: + cFYI(1, "RFC 1002 session keep alive"); + break; + case RFC1002_POSITIVE_SESSION_RESPONSE: + cFYI(1, "RFC 1002 positive session response"); + break; + case RFC1002_NEGATIVE_SESSION_RESPONSE: /* - * We know that we received enough to get to the MID as we - * checked the pdu_length earlier. Now check to see - * if the rest of the header is OK. We borrow the length - * var for the rest of the loop to avoid a new stack var. - * - * 48 bytes is enough to display the header and a little bit - * into the payload for debugging purposes. + * We get this from Windows 98 instead of an error on + * SMB negprot response. */ - length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); - if (length != 0) - cifs_dump_mem("Bad SMB: ", smb_buffer, - min_t(unsigned int, total_read, 48)); + cFYI(1, "RFC 1002 negative session response"); + /* give server a second to clean up */ + msleep(1000); + /* + * Always try 445 first on reconnect since we get NACK + * on some if we ever connected to port 139 (the NACK + * is since we do not begin with RFC1001 session + * initialize frame). + */ + cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT); + cifs_reconnect(server); + wake_up(&server->response_q); + break; + default: + cERROR(1, "RFC 1002 unknown response type 0x%x", type); + cifs_reconnect(server); + } - mid_entry = NULL; - server->lstrp = jiffies; + return false; +} - spin_lock(&GlobalMid_Lock); - list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { - mid_entry = list_entry(tmp, struct mid_q_entry, qhead); +static struct mid_q_entry * +find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf) +{ + struct mid_q_entry *mid; - if (mid_entry->mid != smb_buffer->Mid || - mid_entry->midState != MID_REQUEST_SUBMITTED || - mid_entry->command != smb_buffer->Command) { - mid_entry = NULL; - continue; - } + spin_lock(&GlobalMid_Lock); + list_for_each_entry(mid, &server->pending_mid_q, qhead) { + if (mid->mid == buf->Mid && + mid->midState == MID_REQUEST_SUBMITTED && + mid->command == buf->Command) { + spin_unlock(&GlobalMid_Lock); + return mid; + } + } + spin_unlock(&GlobalMid_Lock); + return NULL; +} - if (length == 0 && - check2ndT2(smb_buffer, server->maxBuf) > 0) { - /* We have a multipart transact2 resp */ - isMultiRsp = true; - if (mid_entry->resp_buf) { - /* merge response - fix up 1st*/ - length = coalesce_t2(smb_buffer, - mid_entry->resp_buf); - if (length > 0) { - length = 0; - mid_entry->multiRsp = true; - break; - } else { - /* all parts received or - * packet is malformed - */ - mid_entry->multiEnd = true; - goto multi_t2_fnd; - } - } else { - if (!isLargeBuf) { - /* - * FIXME: switch to already - * allocated largebuf? - */ - cERROR(1, "1st trans2 resp " - "needs bigbuf"); - } else { - /* Have first buffer */ - mid_entry->resp_buf = - smb_buffer; - mid_entry->largeBuf = true; - bigbuf = NULL; - } - } - break; - } - mid_entry->resp_buf = smb_buffer; - mid_entry->largeBuf = isLargeBuf; -multi_t2_fnd: - if (length == 0) - mid_entry->midState = MID_RESPONSE_RECEIVED; - else - mid_entry->midState = MID_RESPONSE_MALFORMED; +void +dequeue_mid(struct mid_q_entry *mid, bool malformed) +{ #ifdef CONFIG_CIFS_STATS2 - mid_entry->when_received = jiffies; + mid->when_received = jiffies; #endif - list_del_init(&mid_entry->qhead); - break; - } - spin_unlock(&GlobalMid_Lock); - - if (mid_entry != NULL) { - mid_entry->callback(mid_entry); - /* Was previous buf put in mpx struct for multi-rsp? */ - if (!isMultiRsp) { - /* smb buffer will be freed by user thread */ - if (isLargeBuf) - bigbuf = NULL; - else - smallbuf = NULL; - } - } else if (length != 0) { - /* response sanity checks failed */ - continue; - } else if (!is_valid_oplock_break(smb_buffer, server) && - !isMultiRsp) { - cERROR(1, "No task to wake, unknown frame received! " - "NumMids %d", atomic_read(&midCount)); - cifs_dump_mem("Received Data is: ", (char *)smb_buffer, - sizeof(struct smb_hdr)); -#ifdef CONFIG_CIFS_DEBUG2 - cifs_dump_detail(smb_buffer); - cifs_dump_mids(server); -#endif /* CIFS_DEBUG2 */ + spin_lock(&GlobalMid_Lock); + if (!malformed) + mid->midState = MID_RESPONSE_RECEIVED; + else + mid->midState = MID_RESPONSE_MALFORMED; + list_del_init(&mid->qhead); + spin_unlock(&GlobalMid_Lock); +} +static void +handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, + struct smb_hdr *buf, int malformed) +{ + if (malformed == 0 && check2ndT2(buf) > 0) { + mid->multiRsp = true; + if (mid->resp_buf) { + /* merge response - fix up 1st*/ + malformed = coalesce_t2(buf, mid->resp_buf); + if (malformed > 0) + return; + + /* All parts received or packet is malformed. */ + mid->multiEnd = true; + return dequeue_mid(mid, malformed); } - } /* end while !EXITING */ + if (!server->large_buf) { + /*FIXME: switch to already allocated largebuf?*/ + cERROR(1, "1st trans2 resp needs bigbuf"); + } else { + /* Have first buffer */ + mid->resp_buf = buf; + mid->largeBuf = true; + server->bigbuf = NULL; + } + return; + } + mid->resp_buf = buf; + mid->largeBuf = server->large_buf; + /* Was previous buf put in mpx struct for multi-rsp? */ + if (!mid->multiRsp) { + /* smb buffer will be freed by user thread */ + if (server->large_buf) + server->bigbuf = NULL; + else + server->smallbuf = NULL; + } + dequeue_mid(mid, malformed); +} + +static void clean_demultiplex_info(struct TCP_Server_Info *server) +{ + int length; /* take it off the list, if it's not already */ spin_lock(&cifs_tcp_ses_lock); @@ -668,35 +623,39 @@ multi_t2_fnd: spin_unlock(&GlobalMid_Lock); wake_up_all(&server->response_q); - /* check if we have blocked requests that need to free */ - /* Note that cifs_max_pending is normally 50, but - can be set at module install time to as little as two */ + /* + * Check if we have blocked requests that need to free. Note that + * cifs_max_pending is normally 50, but can be set at module install + * time to as little as two. + */ spin_lock(&GlobalMid_Lock); if (atomic_read(&server->inFlight) >= cifs_max_pending) atomic_set(&server->inFlight, cifs_max_pending - 1); - /* We do not want to set the max_pending too low or we - could end up with the counter going negative */ + /* + * We do not want to set the max_pending too low or we could end up + * with the counter going negative. + */ spin_unlock(&GlobalMid_Lock); - /* Although there should not be any requests blocked on - this queue it can not hurt to be paranoid and try to wake up requests - that may haven been blocked when more than 50 at time were on the wire - to the same server - they now will see the session is in exit state - and get out of SendReceive. */ + /* + * Although there should not be any requests blocked on this queue it + * can not hurt to be paranoid and try to wake up requests that may + * haven been blocked when more than 50 at time were on the wire to the + * same server - they now will see the session is in exit state and get + * out of SendReceive. + */ wake_up_all(&server->request_q); /* give those requests time to exit */ msleep(125); if (server->ssocket) { - sock_release(csocket); + sock_release(server->ssocket); server->ssocket = NULL; } - /* buffer usually freed in free_mid - need to free it here on exit */ - cifs_buf_release(bigbuf); - if (smallbuf) /* no sense logging a debug message if NULL */ - cifs_small_buf_release(smallbuf); if (!list_empty(&server->pending_mid_q)) { struct list_head dispose_list; + struct mid_q_entry *mid_entry; + struct list_head *tmp, *tmp2; INIT_LIST_HEAD(&dispose_list); spin_lock(&GlobalMid_Lock); @@ -720,26 +679,189 @@ multi_t2_fnd: } if (!list_empty(&server->pending_mid_q)) { - /* mpx threads have not exited yet give them - at least the smb send timeout time for long ops */ - /* due to delays on oplock break requests, we need - to wait at least 45 seconds before giving up - on a request getting a response and going ahead - and killing cifsd */ + /* + * mpx threads have not exited yet give them at least the smb + * send timeout time for long ops. + * + * Due to delays on oplock break requests, we need to wait at + * least 45 seconds before giving up on a request getting a + * response and going ahead and killing cifsd. + */ cFYI(1, "Wait for exit from demultiplex thread"); msleep(46000); - /* if threads still have not exited they are probably never - coming home not much else we can do but free the memory */ + /* + * If threads still have not exited they are probably never + * coming home not much else we can do but free the memory. + */ } kfree(server->hostname); - task_to_wake = xchg(&server->tsk, NULL); + kfree(server->iov); kfree(server); length = atomic_dec_return(&tcpSesAllocCount); - if (length > 0) + if (length > 0) mempool_resize(cifs_req_poolp, length + cifs_min_rcv, GFP_KERNEL); +} + +static int +standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length; + char *buf = server->smallbuf; + struct smb_hdr *smb_buffer = (struct smb_hdr *)buf; + unsigned int pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); + + /* make sure this will fit in a large buffer */ + if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + cERROR(1, "SMB response too long (%u bytes)", + pdu_length); + cifs_reconnect(server); + wake_up(&server->response_q); + return -EAGAIN; + } + + /* switch to large buffer if too big for a small one */ + if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { + server->large_buf = true; + memcpy(server->bigbuf, server->smallbuf, server->total_read); + buf = server->bigbuf; + smb_buffer = (struct smb_hdr *)buf; + } + + /* now read the rest */ + length = cifs_read_from_socket(server, + buf + sizeof(struct smb_hdr) - 1, + pdu_length - sizeof(struct smb_hdr) + 1 + 4); + if (length < 0) + return length; + server->total_read += length; + + dump_smb(smb_buffer, server->total_read); + + /* + * We know that we received enough to get to the MID as we + * checked the pdu_length earlier. Now check to see + * if the rest of the header is OK. We borrow the length + * var for the rest of the loop to avoid a new stack var. + * + * 48 bytes is enough to display the header and a little bit + * into the payload for debugging purposes. + */ + length = checkSMB(smb_buffer, smb_buffer->Mid, server->total_read); + if (length != 0) + cifs_dump_mem("Bad SMB: ", buf, + min_t(unsigned int, server->total_read, 48)); + + if (mid) + handle_mid(mid, server, smb_buffer, length); + + return length; +} + +static int +cifs_demultiplex_thread(void *p) +{ + int length; + struct TCP_Server_Info *server = p; + unsigned int pdu_length; + char *buf = NULL; + struct smb_hdr *smb_buffer = NULL; + struct task_struct *task_to_wake = NULL; + struct mid_q_entry *mid_entry; + + current->flags |= PF_MEMALLOC; + cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); + + length = atomic_inc_return(&tcpSesAllocCount); + if (length > 1) + mempool_resize(cifs_req_poolp, length + cifs_min_rcv, + GFP_KERNEL); + + set_freezable(); + while (server->tcpStatus != CifsExiting) { + if (try_to_freeze()) + continue; + + if (!allocate_buffers(server)) + continue; + + server->large_buf = false; + smb_buffer = (struct smb_hdr *)server->smallbuf; + buf = server->smallbuf; + pdu_length = 4; /* enough to get RFC1001 header */ + + length = cifs_read_from_socket(server, buf, pdu_length); + if (length < 0) + continue; + server->total_read = length; + + /* + * The right amount was read from socket - 4 bytes, + * so we can now interpret the length field. + */ + pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); + + cFYI(1, "RFC1002 header 0x%x", pdu_length); + if (!is_smb_response(server, buf[0])) + continue; + + /* make sure we have enough to get to the MID */ + if (pdu_length < sizeof(struct smb_hdr) - 1 - 4) { + cERROR(1, "SMB response too short (%u bytes)", + pdu_length); + cifs_reconnect(server); + wake_up(&server->response_q); + continue; + } + + /* read down to the MID */ + length = cifs_read_from_socket(server, buf + 4, + sizeof(struct smb_hdr) - 1 - 4); + if (length < 0) + continue; + server->total_read += length; + + mid_entry = find_mid(server, smb_buffer); + + if (!mid_entry || !mid_entry->receive) + length = standard_receive3(server, mid_entry); + else + length = mid_entry->receive(server, mid_entry); + + if (length < 0) + continue; + + if (server->large_buf) { + buf = server->bigbuf; + smb_buffer = (struct smb_hdr *)buf; + } + + server->lstrp = jiffies; + if (mid_entry != NULL) { + if (!mid_entry->multiRsp || mid_entry->multiEnd) + mid_entry->callback(mid_entry); + } else if (!is_valid_oplock_break(smb_buffer, server)) { + cERROR(1, "No task to wake, unknown frame received! " + "NumMids %d", atomic_read(&midCount)); + cifs_dump_mem("Received Data is: ", buf, + sizeof(struct smb_hdr)); +#ifdef CONFIG_CIFS_DEBUG2 + cifs_dump_detail(smb_buffer); + cifs_dump_mids(server); +#endif /* CIFS_DEBUG2 */ + + } + } /* end while !EXITING */ + + /* buffer usually freed in free_mid - need to free it here on exit */ + cifs_buf_release(server->bigbuf); + if (server->smallbuf) /* no sense logging a debug message if NULL */ + cifs_small_buf_release(server->smallbuf); + + task_to_wake = xchg(&server->tsk, NULL); + clean_demultiplex_info(server); /* if server->tsk was NULL then wait for a signal before exiting */ if (!task_to_wake) { @@ -788,6 +910,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, { char *value, *data, *end; char *mountdata_copy = NULL, *options; + int err; unsigned int temp_len, i, j; char separator[2]; short int override_uid = -1; @@ -844,6 +967,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, cFYI(1, "Null separator not allowed"); } } + vol->backupuid_specified = false; /* no backup intent for a user */ + vol->backupgid_specified = false; /* no backup intent for a group */ while ((data = strsep(&options, separator)) != NULL) { if (!*data) @@ -1259,7 +1384,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* ignore */ } else if (strnicmp(data, "guest", 5) == 0) { /* ignore */ - } else if (strnicmp(data, "rw", 2) == 0) { + } else if (strnicmp(data, "rw", 2) == 0 && strlen(data) == 2) { /* ignore */ } else if (strnicmp(data, "ro", 2) == 0) { /* ignore */ @@ -1362,7 +1487,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->server_ino = 1; } else if (strnicmp(data, "noserverino", 9) == 0) { vol->server_ino = 0; - } else if (strnicmp(data, "rwpidforward", 4) == 0) { + } else if (strnicmp(data, "rwpidforward", 12) == 0) { vol->rwpidforward = 1; } else if (strnicmp(data, "cifsacl", 7) == 0) { vol->cifs_acl = 1; @@ -1403,6 +1528,22 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->mfsymlinks = true; } else if (strnicmp(data, "multiuser", 8) == 0) { vol->multiuser = true; + } else if (!strnicmp(data, "backupuid", 9) && value && *value) { + err = kstrtouint(value, 0, &vol->backupuid); + if (err < 0) { + cERROR(1, "%s: Invalid backupuid value", + __func__); + goto cifs_parse_mount_err; + } + vol->backupuid_specified = true; + } else if (!strnicmp(data, "backupgid", 9) && value && *value) { + err = kstrtouint(value, 0, &vol->backupgid); + if (err < 0) { + cERROR(1, "%s: Invalid backupgid value", + __func__); + goto cifs_parse_mount_err; + } + vol->backupgid_specified = true; } else printk(KERN_WARNING "CIFS: Unknown mount option %s\n", data); @@ -1979,7 +2120,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) warned_on_ntlm = true; cERROR(1, "default security mechanism requested. The default " "security mechanism will be upgraded from ntlm to " - "ntlmv2 in kernel release 3.1"); + "ntlmv2 in kernel release 3.2"); } ses->overrideSecFlg = volume_info->secFlg; @@ -2170,16 +2311,16 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) (new->mnt_cifs_flags & CIFS_MOUNT_MASK)) return 0; - if (old->rsize != new->rsize) - return 0; - /* - * We want to share sb only if we don't specify wsize or specified wsize - * is greater or equal than existing one. + * We want to share sb only if we don't specify an r/wsize or + * specified r/wsize is greater than or equal to existing one. */ if (new->wsize && new->wsize < old->wsize) return 0; + if (new->rsize && new->rsize < old->rsize) + return 0; + if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) return 0; @@ -2617,14 +2758,6 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, CIFS_MOUNT_POSIX_PATHS; } - if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) { - if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { - cifs_sb->rsize = 127 * 1024; - cFYI(DBG2, "larger reads not supported by srv"); - } - } - - cFYI(1, "Negotiate caps 0x%x", (int)cap); #ifdef CONFIG_CIFS_DEBUG2 if (cap & CIFS_UNIX_FCNTL_CAP) @@ -2669,31 +2802,19 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, spin_lock_init(&cifs_sb->tlink_tree_lock); cifs_sb->tlink_tree = RB_ROOT; - if (pvolume_info->rsize > CIFSMaxBufSize) { - cERROR(1, "rsize %d too large, using MaxBufSize", - pvolume_info->rsize); - cifs_sb->rsize = CIFSMaxBufSize; - } else if ((pvolume_info->rsize) && - (pvolume_info->rsize <= CIFSMaxBufSize)) - cifs_sb->rsize = pvolume_info->rsize; - else /* default */ - cifs_sb->rsize = CIFSMaxBufSize; - - if (cifs_sb->rsize < 2048) { - cifs_sb->rsize = 2048; - /* Windows ME may prefer this */ - cFYI(1, "readsize set to minimum: 2048"); - } - /* - * Temporarily set wsize for matching superblock. If we end up using - * new sb then cifs_negotiate_wsize will later negotiate it downward - * if needed. + * Temporarily set r/wsize for matching superblock. If we end up using + * new sb then client will later negotiate it downward if needed. */ + cifs_sb->rsize = pvolume_info->rsize; cifs_sb->wsize = pvolume_info->wsize; cifs_sb->mnt_uid = pvolume_info->linux_uid; cifs_sb->mnt_gid = pvolume_info->linux_gid; + if (pvolume_info->backupuid_specified) + cifs_sb->mnt_backupuid = pvolume_info->backupuid; + if (pvolume_info->backupgid_specified) + cifs_sb->mnt_backupgid = pvolume_info->backupgid; cifs_sb->mnt_file_mode = pvolume_info->file_mode; cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; cFYI(1, "file mode: 0x%x dir mode: 0x%x", @@ -2724,6 +2845,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; if (pvolume_info->cifs_acl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; + if (pvolume_info->backupuid_specified) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID; + if (pvolume_info->backupgid_specified) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID; if (pvolume_info->override_uid) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; if (pvolume_info->override_gid) @@ -2756,29 +2881,41 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, } /* - * When the server supports very large writes via POSIX extensions, we can - * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including - * the RFC1001 length. + * When the server supports very large reads and writes via POSIX extensions, + * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not + * including the RFC1001 length. * * Note that this might make for "interesting" allocation problems during * writeback however as we have to allocate an array of pointers for the * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. + * + * For reads, there is a similar problem as we need to allocate an array + * of kvecs to handle the receive, though that should only need to be done + * once. */ #define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4) /* - * When the server doesn't allow large posix writes, only allow a wsize of - * 128k minus the size of the WRITE_AND_X header. That allows for a write up - * to the maximum size described by RFC1002. + * When the server doesn't allow large posix writes, only allow a rsize/wsize + * of 2^17-1 minus the size of the call header. That allows for a read or + * write up to the maximum size described by RFC1002. */ -#define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4) /* * The default wsize is 1M. find_get_pages seems to return a maximum of 256 * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill * a single wsize request with a single call. */ -#define CIFS_DEFAULT_WSIZE (1024 * 1024) +#define CIFS_DEFAULT_IOSIZE (1024 * 1024) + +/* + * Windows only supports a max of 60k reads. Default to that when posix + * extensions aren't in force. + */ +#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024) static unsigned int cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) @@ -2786,7 +2923,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); struct TCP_Server_Info *server = tcon->ses->server; unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize : - CIFS_DEFAULT_WSIZE; + CIFS_DEFAULT_IOSIZE; /* can server support 24-bit write sizes? (via UNIX extensions) */ if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) @@ -2809,6 +2946,50 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) return wsize; } +static unsigned int +cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) +{ + __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int rsize, defsize; + + /* + * Set default value... + * + * HACK alert! Ancient servers have very small buffers. Even though + * MS-CIFS indicates that servers are only limited by the client's + * bufsize for reads, testing against win98se shows that it throws + * INVALID_PARAMETER errors if you try to request too large a read. + * + * If the server advertises a MaxBufferSize of less than one page, + * assume that it also can't satisfy reads larger than that either. + * + * FIXME: Is there a better heuristic for this? + */ + if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP)) + defsize = CIFS_DEFAULT_IOSIZE; + else if (server->capabilities & CAP_LARGE_READ_X) + defsize = CIFS_DEFAULT_NON_POSIX_RSIZE; + else if (server->maxBuf >= PAGE_CACHE_SIZE) + defsize = CIFSMaxBufSize; + else + defsize = server->maxBuf - sizeof(READ_RSP); + + rsize = pvolume_info->rsize ? pvolume_info->rsize : defsize; + + /* + * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to + * the client's MaxBufferSize. + */ + if (!(server->capabilities & CAP_LARGE_READ_X)) + rsize = min_t(unsigned int, CIFSMaxBufSize, rsize); + + /* hard limit of CIFS_MAX_RSIZE */ + rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE); + + return rsize; +} + static int is_path_accessible(int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path) @@ -2838,8 +3019,9 @@ cleanup_volume_info_contents(struct smb_vol *volume_info) { kfree(volume_info->username); kzfree(volume_info->password); + if (volume_info->UNCip != volume_info->UNC + 2) + kfree(volume_info->UNCip); kfree(volume_info->UNC); - kfree(volume_info->UNCip); kfree(volume_info->domainname); kfree(volume_info->iocharset); kfree(volume_info->prepath); @@ -3001,6 +3183,22 @@ cifs_get_volume_info(char *mount_data, const char *devname) return volume_info; } +/* make sure ra_pages is a multiple of rsize */ +static inline unsigned int +cifs_ra_pages(struct cifs_sb_info *cifs_sb) +{ + unsigned int reads; + unsigned int rsize_pages = cifs_sb->rsize / PAGE_CACHE_SIZE; + + if (rsize_pages >= default_backing_dev_info.ra_pages) + return default_backing_dev_info.ra_pages; + else if (rsize_pages == 0) + return rsize_pages; + + reads = default_backing_dev_info.ra_pages / rsize_pages; + return reads * rsize_pages; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { @@ -3019,8 +3217,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) if (rc) return rc; - cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; - #ifdef CONFIG_CIFS_DFS_UPCALL try_mount_again: /* cleanup activities if we're chasing a referral */ @@ -3085,15 +3281,11 @@ try_mount_again: CIFSSMBQFSAttributeInfo(xid, tcon); } - if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { - cifs_sb->rsize = 1024 * 127; - cFYI(DBG2, "no very large read support, rsize now 127K"); - } - if (!(tcon->ses->capabilities & CAP_LARGE_READ_X)) - cifs_sb->rsize = min(cifs_sb->rsize, - (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); - cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); + cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info); + + /* tune readahead according to rsize */ + cifs_sb->bdi.ra_pages = cifs_ra_pages(cifs_sb); remote_path_check: #ifdef CONFIG_CIFS_DFS_UPCALL @@ -3193,15 +3385,9 @@ mount_fail_check: else cifs_put_tcp_session(srvTcp); bdi_destroy(&cifs_sb->bdi); - goto out; } - /* volume_info->password is freed above when existing session found - (in which case it is not needed anymore) but when new sesion is created - the password ptr is put in the new session structure (in which case the - password will be freed at unmount time) */ out: - /* zero out password before freeing */ FreeXid(xid); return rc; } @@ -3267,7 +3453,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, else #endif /* CIFS_WEAK_PW_HASH */ rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, - bcc_ptr); + bcc_ptr, nls_codepage); bcc_ptr += CIFS_AUTH_RESP_SIZE; if (ses->capabilities & CAP_UNICODE) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 14d602f178c2..d7eeb9d3ed6f 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -57,11 +57,6 @@ build_path_from_dentry(struct dentry *direntry) struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); unsigned seq; - if (direntry == NULL) - return NULL; /* not much we can do if dentry is freed and - we need to reopen the file after it was closed implicitly - when the server crashed */ - dirsep = CIFS_DIR_SEP(cifs_sb); if (tcon->Flags & SMB_SHARE_IS_IN_DFS) dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); @@ -110,8 +105,8 @@ cifs_bp_rename_retry: } rcu_read_unlock(); if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { - cERROR(1, "did not end path lookup where expected namelen is %d", - namelen); + cFYI(1, "did not end path lookup where expected. namelen=%d " + "dfsplen=%d", namelen, dfsplen); /* presumably this is only possible if racing with a rename of one of the parent directories (we can not lock the dentries above us to prevent this, but retrying should be harmless) */ @@ -176,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, } tcon = tlink_tcon(tlink); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; if (nd) @@ -249,6 +244,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, if (!tcon->unix_ext && (mode & S_IWUGO) == 0) create_options |= CREATE_OPTION_READONLY; + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + if (tcon->ses->capabilities & CAP_NT_SMBS) rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, create_options, @@ -362,6 +360,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, { int rc = -EPERM; int xid; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *pTcon; @@ -436,9 +435,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, return rc; } - /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */ + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, - GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, + GENERIC_WRITE, create_options, &fileHandle, &oplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) @@ -641,14 +642,22 @@ lookup_out: static int cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && (nd->flags & LOOKUP_RCU)) return -ECHILD; if (direntry->d_inode) { if (cifs_revalidate_dentry(direntry)) return 0; - else + else { + /* + * Forcibly invalidate automounting directory inodes + * (remote DFS directories) so to have them + * instantiated again for automount + */ + if (IS_AUTOMOUNT(direntry->d_inode)) + return 0; return 1; + } } /* diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 548f06230a6d..1d2d91d9bf65 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -79,8 +79,8 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) /* Perform the upcall */ rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); if (rc < 0) - cERROR(1, "%s: unable to resolve: %*.*s", - __func__, len, len, hostname); + cFYI(1, "%s: unable to resolve: %*.*s", + __func__, len, len, hostname); else cFYI(1, "%s: resolved: %*.*s to %s", __func__, len, len, hostname, *ip_addr); diff --git a/fs/cifs/export.c b/fs/cifs/export.c index 55d87ac52000..9c7ecdccf2f3 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -45,7 +45,7 @@ #include "cifs_debug.h" #include "cifsfs.h" -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT static struct dentry *cifs_get_parent(struct dentry *dentry) { /* BB need to add code here eventually to enable export via NFSD */ @@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = { .encode_fs = */ }; -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 378acdafa356..cf0b1539b321 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -32,6 +32,7 @@ #include <linux/delay.h> #include <linux/mount.h> #include <linux/slab.h> +#include <linux/swap.h> #include <asm/div64.h> #include "cifsfs.h" #include "cifspdu.h" @@ -174,6 +175,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, int rc; int desiredAccess; int disposition; + int create_options = CREATE_NOT_DIR; FILE_ALL_INFO *buf; desiredAccess = cifs_convert_flags(f_flags); @@ -210,9 +212,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, if (!buf) return -ENOMEM; + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + if (tcon->ses->capabilities & CAP_NT_SMBS) rc = CIFSSMBOpen(xid, tcon, full_path, disposition, - desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, + desiredAccess, create_options, pnetfid, poplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); else @@ -258,8 +263,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, pCifsFile->invalidHandle = false; pCifsFile->tlink = cifs_get_tlink(tlink); mutex_init(&pCifsFile->fh_mutex); - mutex_init(&pCifsFile->lock_mutex); - INIT_LIST_HEAD(&pCifsFile->llist); INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); spin_lock(&cifs_file_list_lock); @@ -272,11 +275,14 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, spin_unlock(&cifs_file_list_lock); cifs_set_oplock_level(pCifsInode, oplock); + pCifsInode->can_cache_brlcks = pCifsInode->clientCanCacheAll; file->private_data = pCifsFile; return pCifsFile; } +static void cifs_del_lock_waiters(struct cifsLockInfo *lock); + /* * Release a reference on the file private data. This may involve closing * the filehandle out on the server. Must be called without holding @@ -314,6 +320,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) } spin_unlock(&cifs_file_list_lock); + cancel_work_sync(&cifs_file->oplock_break); + if (!tcon->need_reconnect && !cifs_file->invalidHandle) { int xid, rc; @@ -325,12 +333,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) /* Delete any outstanding lock records. We'll lose them when the file * is closed anyway. */ - mutex_lock(&cifs_file->lock_mutex); - list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) { + mutex_lock(&cifsi->lock_mutex); + list_for_each_entry_safe(li, tmp, &cifsi->llist, llist) { + if (li->netfid != cifs_file->netfid) + continue; list_del(&li->llist); + cifs_del_lock_waiters(li); kfree(li); } - mutex_unlock(&cifs_file->lock_mutex); + mutex_unlock(&cifsi->lock_mutex); cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); @@ -369,7 +380,7 @@ int cifs_open(struct inode *inode, struct file *file) cFYI(1, "inode = 0x%p file flags are 0x%x for %s", inode, file->f_flags, full_path); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; else oplock = 0; @@ -463,6 +474,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) char *full_path = NULL; int desiredAccess; int disposition = FILE_OPEN; + int create_options = CREATE_NOT_DIR; __u16 netfid; xid = GetXid(); @@ -493,7 +505,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) cFYI(1, "inode = 0x%p file flags 0x%x for %s", inode, pCifsFile->f_flags, full_path); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; else oplock = 0; @@ -522,6 +534,9 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) desiredAccess = cifs_convert_flags(pCifsFile->f_flags); + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + /* Can not refresh inode by passing in file_info buf to be returned by SMBOpen and then calling get_inode_info with returned buf since file might have write behind data that needs to be flushed @@ -529,7 +544,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) that inode was not dirty locally we could do this */ rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, - CREATE_NOT_DIR, &netfid, &oplock, NULL, + create_options, &netfid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { @@ -629,219 +644,687 @@ int cifs_closedir(struct inode *inode, struct file *file) return rc; } -static int store_file_lock(struct cifsFileInfo *fid, __u64 len, - __u64 offset, __u8 lockType) +static struct cifsLockInfo * +cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid) { - struct cifsLockInfo *li = + struct cifsLockInfo *lock = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); - if (li == NULL) - return -ENOMEM; - li->offset = offset; - li->length = len; - li->type = lockType; - mutex_lock(&fid->lock_mutex); - list_add(&li->llist, &fid->llist); - mutex_unlock(&fid->lock_mutex); - return 0; + if (!lock) + return lock; + lock->offset = offset; + lock->length = length; + lock->type = type; + lock->netfid = netfid; + lock->pid = current->tgid; + INIT_LIST_HEAD(&lock->blist); + init_waitqueue_head(&lock->block_q); + return lock; } -int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) +static void +cifs_del_lock_waiters(struct cifsLockInfo *lock) { - int rc, xid; - __u32 numLock = 0; - __u32 numUnlock = 0; - __u64 length; - bool wait_flag = false; - struct cifs_sb_info *cifs_sb; + struct cifsLockInfo *li, *tmp; + list_for_each_entry_safe(li, tmp, &lock->blist, blist) { + list_del_init(&li->blist); + wake_up(&li->block_q); + } +} + +static bool +__cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, + __u64 length, __u8 type, __u16 netfid, + struct cifsLockInfo **conf_lock) +{ + struct cifsLockInfo *li, *tmp; + + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (offset + length <= li->offset || + offset >= li->offset + li->length) + continue; + else if ((type & LOCKING_ANDX_SHARED_LOCK) && + ((netfid == li->netfid && current->tgid == li->pid) || + type == li->type)) + continue; + else { + *conf_lock = li; + return true; + } + } + return false; +} + +static bool +cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, + struct cifsLockInfo **conf_lock) +{ + return __cifs_find_lock_conflict(cinode, lock->offset, lock->length, + lock->type, lock->netfid, conf_lock); +} + +static int +cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, + __u8 type, __u16 netfid, struct file_lock *flock) +{ + int rc = 0; + struct cifsLockInfo *conf_lock; + bool exist; + + mutex_lock(&cinode->lock_mutex); + + exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid, + &conf_lock); + if (exist) { + flock->fl_start = conf_lock->offset; + flock->fl_end = conf_lock->offset + conf_lock->length - 1; + flock->fl_pid = conf_lock->pid; + if (conf_lock->type & LOCKING_ANDX_SHARED_LOCK) + flock->fl_type = F_RDLCK; + else + flock->fl_type = F_WRLCK; + } else if (!cinode->can_cache_brlcks) + rc = 1; + else + flock->fl_type = F_UNLCK; + + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static void +cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock) +{ + mutex_lock(&cinode->lock_mutex); + list_add_tail(&lock->llist, &cinode->llist); + mutex_unlock(&cinode->lock_mutex); +} + +static int +cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, + bool wait) +{ + struct cifsLockInfo *conf_lock; + bool exist; + int rc = 0; + +try_again: + exist = false; + mutex_lock(&cinode->lock_mutex); + + exist = cifs_find_lock_conflict(cinode, lock, &conf_lock); + if (!exist && cinode->can_cache_brlcks) { + list_add_tail(&lock->llist, &cinode->llist); + mutex_unlock(&cinode->lock_mutex); + return rc; + } + + if (!exist) + rc = 1; + else if (!wait) + rc = -EACCES; + else { + list_add_tail(&lock->blist, &conf_lock->blist); + mutex_unlock(&cinode->lock_mutex); + rc = wait_event_interruptible(lock->block_q, + (lock->blist.prev == &lock->blist) && + (lock->blist.next == &lock->blist)); + if (!rc) + goto try_again; + mutex_lock(&cinode->lock_mutex); + list_del_init(&lock->blist); + } + + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static int +cifs_posix_lock_test(struct file *file, struct file_lock *flock) +{ + int rc = 0; + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + unsigned char saved_type = flock->fl_type; + + if ((flock->fl_flags & FL_POSIX) == 0) + return 1; + + mutex_lock(&cinode->lock_mutex); + posix_test_lock(file, flock); + + if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) { + flock->fl_type = saved_type; + rc = 1; + } + + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static int +cifs_posix_lock_set(struct file *file, struct file_lock *flock) +{ + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + int rc = 1; + + if ((flock->fl_flags & FL_POSIX) == 0) + return rc; + + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + return rc; + } + rc = posix_lock_file_wait(file, flock); + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static int +cifs_push_mandatory_locks(struct cifsFileInfo *cfile) +{ + int xid, rc = 0, stored_rc; + struct cifsLockInfo *li, *tmp; struct cifs_tcon *tcon; - __u16 netfid; - __u8 lockType = LOCKING_ANDX_LARGE_FILES; - bool posix_locking = 0; + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + unsigned int num, max_num; + LOCKING_ANDX_RANGE *buf, *cur; + int types[] = {LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + int i; + + xid = GetXid(); + tcon = tlink_tcon(cfile->tlink); + + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } + + max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); + buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + if (!buf) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } + + for (i = 0; i < 2; i++) { + cur = buf; + num = 0; + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (li->type != types[i]) + continue; + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32)); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + li->type, 0, num, buf); + if (stored_rc) + rc = stored_rc; + cur = buf; + num = 0; + } else + cur++; + } + + if (num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + types[i], 0, num, buf); + if (stored_rc) + rc = stored_rc; + } + } + + cinode->can_cache_brlcks = false; + mutex_unlock(&cinode->lock_mutex); + + kfree(buf); + FreeXid(xid); + return rc; +} + +/* copied from fs/locks.c with a name change */ +#define cifs_for_each_lock(inode, lockp) \ + for (lockp = &inode->i_flock; *lockp != NULL; \ + lockp = &(*lockp)->fl_next) + +static int +cifs_push_posix_locks(struct cifsFileInfo *cfile) +{ + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct file_lock *flock, **before; + struct cifsLockInfo *lck, *tmp; + int rc = 0, xid, type; + __u64 length; + struct list_head locks_to_send; - length = 1 + pfLock->fl_end - pfLock->fl_start; - rc = -EACCES; xid = GetXid(); - cFYI(1, "Lock parm: 0x%x flockflags: " - "0x%x flocktype: 0x%x start: %lld end: %lld", - cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, - pfLock->fl_end); + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } + + INIT_LIST_HEAD(&locks_to_send); + + lock_flocks(); + cifs_for_each_lock(cfile->dentry->d_inode, before) { + flock = *before; + length = 1 + flock->fl_end - flock->fl_start; + if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK) + type = CIFS_RDLCK; + else + type = CIFS_WRLCK; + + lck = cifs_lock_init(flock->fl_start, length, type, + cfile->netfid); + if (!lck) { + rc = -ENOMEM; + goto send_locks; + } + lck->pid = flock->fl_pid; + + list_add_tail(&lck->llist, &locks_to_send); + } + +send_locks: + unlock_flocks(); + + list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { + struct file_lock tmp_lock; + int stored_rc; + + tmp_lock.fl_start = lck->offset; + stored_rc = CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid, + 0, lck->length, &tmp_lock, + lck->type, 0); + if (stored_rc) + rc = stored_rc; + list_del(&lck->llist); + kfree(lck); + } + + cinode->can_cache_brlcks = false; + mutex_unlock(&cinode->lock_mutex); + + FreeXid(xid); + return rc; +} - if (pfLock->fl_flags & FL_POSIX) +static int +cifs_push_locks(struct cifsFileInfo *cfile) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + return cifs_push_posix_locks(cfile); + + return cifs_push_mandatory_locks(cfile); +} + +static void +cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock, + bool *wait_flag) +{ + if (flock->fl_flags & FL_POSIX) cFYI(1, "Posix"); - if (pfLock->fl_flags & FL_FLOCK) + if (flock->fl_flags & FL_FLOCK) cFYI(1, "Flock"); - if (pfLock->fl_flags & FL_SLEEP) { + if (flock->fl_flags & FL_SLEEP) { cFYI(1, "Blocking lock"); - wait_flag = true; + *wait_flag = true; } - if (pfLock->fl_flags & FL_ACCESS) + if (flock->fl_flags & FL_ACCESS) cFYI(1, "Process suspended by mandatory locking - " - "not implemented yet"); - if (pfLock->fl_flags & FL_LEASE) + "not implemented yet"); + if (flock->fl_flags & FL_LEASE) cFYI(1, "Lease on file - not implemented yet"); - if (pfLock->fl_flags & + if (flock->fl_flags & (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) - cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags); + cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags); - if (pfLock->fl_type == F_WRLCK) { + *type = LOCKING_ANDX_LARGE_FILES; + if (flock->fl_type == F_WRLCK) { cFYI(1, "F_WRLCK "); - numLock = 1; - } else if (pfLock->fl_type == F_UNLCK) { + *lock = 1; + } else if (flock->fl_type == F_UNLCK) { cFYI(1, "F_UNLCK"); - numUnlock = 1; - /* Check if unlock includes more than - one lock range */ - } else if (pfLock->fl_type == F_RDLCK) { + *unlock = 1; + /* Check if unlock includes more than one lock range */ + } else if (flock->fl_type == F_RDLCK) { cFYI(1, "F_RDLCK"); - lockType |= LOCKING_ANDX_SHARED_LOCK; - numLock = 1; - } else if (pfLock->fl_type == F_EXLCK) { + *type |= LOCKING_ANDX_SHARED_LOCK; + *lock = 1; + } else if (flock->fl_type == F_EXLCK) { cFYI(1, "F_EXLCK"); - numLock = 1; - } else if (pfLock->fl_type == F_SHLCK) { + *lock = 1; + } else if (flock->fl_type == F_SHLCK) { cFYI(1, "F_SHLCK"); - lockType |= LOCKING_ANDX_SHARED_LOCK; - numLock = 1; + *type |= LOCKING_ANDX_SHARED_LOCK; + *lock = 1; } else cFYI(1, "Unknown type of lock"); +} - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); - netfid = ((struct cifsFileInfo *)file->private_data)->netfid; +static int +cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, + bool wait_flag, bool posix_lck, int xid) +{ + int rc = 0; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + __u16 netfid = cfile->netfid; - if ((tcon->ses->capabilities & CAP_UNIX) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - posix_locking = 1; - /* BB add code here to normalize offset and length to - account for negative length which we can not accept over the - wire */ - if (IS_GETLK(cmd)) { - if (posix_locking) { - int posix_lock_type; - if (lockType & LOCKING_ANDX_SHARED_LOCK) - posix_lock_type = CIFS_RDLCK; - else - posix_lock_type = CIFS_WRLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */, - length, pfLock, posix_lock_type, - wait_flag); - FreeXid(xid); + if (posix_lck) { + int posix_lock_type; + + rc = cifs_posix_lock_test(file, flock); + if (!rc) return rc; - } - /* BB we could chain these into one lock request BB */ - rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, - 0, 1, lockType, 0 /* wait flag */, 0); - if (rc == 0) { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 1 /* numUnlock */ , - 0 /* numLock */ , lockType, - 0 /* wait flag */, 0); - pfLock->fl_type = F_UNLCK; - if (rc != 0) - cERROR(1, "Error unlocking previously locked " - "range %d during test of lock", rc); - rc = 0; + if (type & LOCKING_ANDX_SHARED_LOCK) + posix_lock_type = CIFS_RDLCK; + else + posix_lock_type = CIFS_WRLCK; + rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, + 1 /* get */, length, flock, + posix_lock_type, wait_flag); + return rc; + } - } else { - /* if rc == ERR_SHARING_VIOLATION ? */ - rc = 0; + rc = cifs_lock_test(cinode, flock->fl_start, length, type, netfid, + flock); + if (!rc) + return rc; + + /* BB we could chain these into one lock request BB */ + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, 1, type, 0, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, + length, flock->fl_start, 1, 0, + type, 0, 0); + flock->fl_type = F_UNLCK; + if (rc != 0) + cERROR(1, "Error unlocking previously locked " + "range %d during test of lock", rc); + return 0; + } + + if (type & LOCKING_ANDX_SHARED_LOCK) { + flock->fl_type = F_WRLCK; + return 0; + } + + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, 1, + type | LOCKING_ANDX_SHARED_LOCK, 0, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, + length, flock->fl_start, 1, 0, + type | LOCKING_ANDX_SHARED_LOCK, + 0, 0); + flock->fl_type = F_RDLCK; + if (rc != 0) + cERROR(1, "Error unlocking previously locked " + "range %d during test of lock", rc); + } else + flock->fl_type = F_WRLCK; + + return 0; +} + +static void +cifs_move_llist(struct list_head *source, struct list_head *dest) +{ + struct list_head *li, *tmp; + list_for_each_safe(li, tmp, source) + list_move(li, dest); +} + +static void +cifs_free_llist(struct list_head *llist) +{ + struct cifsLockInfo *li, *tmp; + list_for_each_entry_safe(li, tmp, llist, llist) { + cifs_del_lock_waiters(li); + list_del(&li->llist); + kfree(li); + } +} + +static int +cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) +{ + int rc = 0, stored_rc; + int types[] = {LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + unsigned int i; + unsigned int max_num, num; + LOCKING_ANDX_RANGE *buf, *cur; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsLockInfo *li, *tmp; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct list_head tmp_llist; + + INIT_LIST_HEAD(&tmp_llist); - if (lockType & LOCKING_ANDX_SHARED_LOCK) { - pfLock->fl_type = F_WRLCK; + max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); + buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + mutex_lock(&cinode->lock_mutex); + for (i = 0; i < 2; i++) { + cur = buf; + num = 0; + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (flock->fl_start > li->offset || + (flock->fl_start + length) < + (li->offset + li->length)) + continue; + if (current->tgid != li->pid) + continue; + if (cfile->netfid != li->netfid) + continue; + if (types[i] != li->type) + continue; + if (!cinode->can_cache_brlcks) { + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = + cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = + cpu_to_le32((u32)(li->offset>>32)); + /* + * We need to save a lock here to let us add + * it again to the inode list if the unlock + * range request fails on the server. + */ + list_move(&li->llist, &tmp_llist); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, + cfile->netfid, + li->type, num, + 0, buf); + if (stored_rc) { + /* + * We failed on the unlock range + * request - add all locks from + * the tmp list to the head of + * the inode list. + */ + cifs_move_llist(&tmp_llist, + &cinode->llist); + rc = stored_rc; + } else + /* + * The unlock range request + * succeed - free the tmp list. + */ + cifs_free_llist(&tmp_llist); + cur = buf; + num = 0; + } else + cur++; } else { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 0, 1, - lockType | LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */, 0); - if (rc == 0) { - rc = CIFSSMBLock(xid, tcon, netfid, - length, pfLock->fl_start, 1, 0, - lockType | - LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */, 0); - pfLock->fl_type = F_RDLCK; - if (rc != 0) - cERROR(1, "Error unlocking " - "previously locked range %d " - "during test of lock", rc); - rc = 0; - } else { - pfLock->fl_type = F_WRLCK; - rc = 0; - } + /* + * We can cache brlock requests - simply remove + * a lock from the inode list. + */ + list_del(&li->llist); + cifs_del_lock_waiters(li); + kfree(li); } } - - FreeXid(xid); - return rc; + if (num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + types[i], num, 0, buf); + if (stored_rc) { + cifs_move_llist(&tmp_llist, &cinode->llist); + rc = stored_rc; + } else + cifs_free_llist(&tmp_llist); + } } - if (!numLock && !numUnlock) { - /* if no lock or unlock then nothing - to do since we do not know what it is */ - FreeXid(xid); - return -EOPNOTSUPP; - } + mutex_unlock(&cinode->lock_mutex); + kfree(buf); + return rc; +} - if (posix_locking) { +static int +cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, + bool wait_flag, bool posix_lck, int lock, int unlock, int xid) +{ + int rc = 0; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + __u16 netfid = cfile->netfid; + + if (posix_lck) { int posix_lock_type; - if (lockType & LOCKING_ANDX_SHARED_LOCK) + + rc = cifs_posix_lock_set(file, flock); + if (!rc || rc < 0) + return rc; + + if (type & LOCKING_ANDX_SHARED_LOCK) posix_lock_type = CIFS_RDLCK; else posix_lock_type = CIFS_WRLCK; - if (numUnlock == 1) + if (unlock == 1) posix_lock_type = CIFS_UNLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, - length, pfLock, posix_lock_type, - wait_flag); - } else { - struct cifsFileInfo *fid = file->private_data; + rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, + 0 /* set */, length, flock, + posix_lock_type, wait_flag); + goto out; + } - if (numLock) { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 0, numLock, lockType, - wait_flag, 0); + if (lock) { + struct cifsLockInfo *lock; - if (rc == 0) { - /* For Windows locks we must store them. */ - rc = store_file_lock(fid, length, - pfLock->fl_start, lockType); - } - } else if (numUnlock) { - /* For each stored lock that this unlock overlaps - completely, unlock it. */ - int stored_rc = 0; - struct cifsLockInfo *li, *tmp; + lock = cifs_lock_init(flock->fl_start, length, type, netfid); + if (!lock) + return -ENOMEM; - rc = 0; - mutex_lock(&fid->lock_mutex); - list_for_each_entry_safe(li, tmp, &fid->llist, llist) { - if (pfLock->fl_start <= li->offset && - (pfLock->fl_start + length) >= - (li->offset + li->length)) { - stored_rc = CIFSSMBLock(xid, tcon, - netfid, li->length, - li->offset, 1, 0, - li->type, false, 0); - if (stored_rc) - rc = stored_rc; - else { - list_del(&li->llist); - kfree(li); - } - } - } - mutex_unlock(&fid->lock_mutex); + rc = cifs_lock_add_if(cinode, lock, wait_flag); + if (rc < 0) + kfree(lock); + if (rc <= 0) + goto out; + + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, 1, type, wait_flag, 0); + if (rc) { + kfree(lock); + goto out; } + + cifs_lock_add(cinode, lock); + } else if (unlock) + rc = cifs_unlock_range(cfile, flock, xid); + +out: + if (flock->fl_flags & FL_POSIX) + posix_lock_file_wait(file, flock); + return rc; +} + +int cifs_lock(struct file *file, int cmd, struct file_lock *flock) +{ + int rc, xid; + int lock = 0, unlock = 0; + bool wait_flag = false; + bool posix_lck = false; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsInodeInfo *cinode; + struct cifsFileInfo *cfile; + __u16 netfid; + __u8 type; + + rc = -EACCES; + xid = GetXid(); + + cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld " + "end: %lld", cmd, flock->fl_flags, flock->fl_type, + flock->fl_start, flock->fl_end); + + cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag); + + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + cfile = (struct cifsFileInfo *)file->private_data; + tcon = tlink_tcon(cfile->tlink); + netfid = cfile->netfid; + cinode = CIFS_I(file->f_path.dentry->d_inode); + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + posix_lck = true; + /* + * BB add code here to normalize offset and length to account for + * negative length which we can not accept over the wire. + */ + if (IS_GETLK(cmd)) { + rc = cifs_getlk(file, flock, type, wait_flag, posix_lck, xid); + FreeXid(xid); + return rc; } - if (pfLock->fl_flags & FL_POSIX) - posix_lock_file_wait(file, pfLock); + if (!lock && !unlock) { + /* + * if no lock or unlock then nothing to do since we do not + * know what it is + */ + FreeXid(xid); + return -EOPNOTSUPP; + } + + rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock, + xid); FreeXid(xid); return rc; } @@ -1712,6 +2195,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, struct smb_com_read_rsp *pSMBr; struct cifs_io_parms io_parms; char *read_data; + unsigned int rsize; __u32 pid; if (!nr_segs) @@ -1724,6 +2208,9 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, xid = GetXid(); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + /* FIXME: set up handlers for larger reads and/or convert to async */ + rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); + open_file = file->private_data; pTcon = tlink_tcon(open_file->tlink); @@ -1736,7 +2223,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, cFYI(1, "attempting read on write only file instance"); for (total_read = 0; total_read < len; total_read += bytes_read) { - cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); + cur_len = min_t(const size_t, len - total_read, rsize); rc = -EAGAIN; read_data = NULL; @@ -1828,6 +2315,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, unsigned int bytes_read = 0; unsigned int total_read; unsigned int current_read_size; + unsigned int rsize; struct cifs_sb_info *cifs_sb; struct cifs_tcon *pTcon; int xid; @@ -1840,6 +2328,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, xid = GetXid(); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + /* FIXME: set up handlers for larger reads and/or convert to async */ + rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); + if (file->private_data == NULL) { rc = -EBADF; FreeXid(xid); @@ -1859,14 +2350,14 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, for (total_read = 0, current_offset = read_data; read_size > total_read; total_read += bytes_read, current_offset += bytes_read) { - current_read_size = min_t(const int, read_size - total_read, - cifs_sb->rsize); + current_read_size = min_t(uint, read_size - total_read, rsize); + /* For windows me and 9x we do not want to request more than it negotiated since it will refuse the read then */ if ((pTcon->ses) && !(pTcon->ses->capabilities & CAP_LARGE_FILES)) { - current_read_size = min_t(const int, current_read_size, - pTcon->ses->server->maxBuf - 128); + current_read_size = min_t(uint, current_read_size, + CIFSMaxBufSize); } rc = -EAGAIN; while (rc == -EAGAIN) { @@ -1955,82 +2446,24 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) return rc; } - -static void cifs_copy_cache_pages(struct address_space *mapping, - struct list_head *pages, int bytes_read, char *data) -{ - struct page *page; - char *target; - - while (bytes_read > 0) { - if (list_empty(pages)) - break; - - page = list_entry(pages->prev, struct page, lru); - list_del(&page->lru); - - if (add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL)) { - page_cache_release(page); - cFYI(1, "Add page cache failed"); - data += PAGE_CACHE_SIZE; - bytes_read -= PAGE_CACHE_SIZE; - continue; - } - page_cache_release(page); - - target = kmap_atomic(page, KM_USER0); - - if (PAGE_CACHE_SIZE > bytes_read) { - memcpy(target, data, bytes_read); - /* zero the tail end of this partial page */ - memset(target + bytes_read, 0, - PAGE_CACHE_SIZE - bytes_read); - bytes_read = 0; - } else { - memcpy(target, data, PAGE_CACHE_SIZE); - bytes_read -= PAGE_CACHE_SIZE; - } - kunmap_atomic(target, KM_USER0); - - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - data += PAGE_CACHE_SIZE; - - /* add page to FS-Cache */ - cifs_readpage_to_fscache(mapping->host, page); - } - return; -} - static int cifs_readpages(struct file *file, struct address_space *mapping, struct list_head *page_list, unsigned num_pages) { - int rc = -EACCES; - int xid; - loff_t offset; - struct page *page; - struct cifs_sb_info *cifs_sb; - struct cifs_tcon *pTcon; - unsigned int bytes_read = 0; - unsigned int read_size, i; - char *smb_read_data = NULL; - struct smb_com_read_rsp *pSMBr; - struct cifsFileInfo *open_file; - struct cifs_io_parms io_parms; - int buf_type = CIFS_NO_BUFFER; - __u32 pid; + int rc; + struct list_head tmplist; + struct cifsFileInfo *open_file = file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + unsigned int rsize = cifs_sb->rsize; + pid_t pid; - xid = GetXid(); - if (file->private_data == NULL) { - rc = -EBADF; - FreeXid(xid); - return rc; - } - open_file = file->private_data; - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - pTcon = tlink_tcon(open_file->tlink); + /* + * Give up immediately if rsize is too small to read an entire page. + * The VFS will fall back to readpage. We should never reach this + * point however since we set ra_pages to 0 when the rsize is smaller + * than a cache page. + */ + if (unlikely(rsize < PAGE_CACHE_SIZE)) + return 0; /* * Reads as many pages as possible from fscache. Returns -ENOBUFS @@ -2039,125 +2472,127 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, &num_pages); if (rc == 0) - goto read_complete; + return rc; - cFYI(DBG2, "rpages: num pages %d", num_pages); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid; - for (i = 0; i < num_pages; ) { - unsigned contig_pages; - struct page *tmp_page; - unsigned long expected_index; + rc = 0; + INIT_LIST_HEAD(&tmplist); - if (list_empty(page_list)) - break; + cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file, + mapping, num_pages); + + /* + * Start with the page at end of list and move it to private + * list. Do the same with any following pages until we hit + * the rsize limit, hit an index discontinuity, or run out of + * pages. Issue the async read and then start the loop again + * until the list is empty. + * + * Note that list order is important. The page_list is in + * the order of declining indexes. When we put the pages in + * the rdata->pages, then we want them in increasing order. + */ + while (!list_empty(page_list)) { + unsigned int bytes = PAGE_CACHE_SIZE; + unsigned int expected_index; + unsigned int nr_pages = 1; + loff_t offset; + struct page *page, *tpage; + struct cifs_readdata *rdata; page = list_entry(page_list->prev, struct page, lru); + + /* + * Lock the page and put it in the cache. Since no one else + * should have access to this page, we're safe to simply set + * PG_locked without checking it first. + */ + __set_page_locked(page); + rc = add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL); + + /* give up if we can't stick it in the cache */ + if (rc) { + __clear_page_locked(page); + break; + } + + /* move first page to the tmplist */ offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + list_move_tail(&page->lru, &tmplist); - /* count adjacent pages that we will read into */ - contig_pages = 0; - expected_index = - list_entry(page_list->prev, struct page, lru)->index; - list_for_each_entry_reverse(tmp_page, page_list, lru) { - if (tmp_page->index == expected_index) { - contig_pages++; - expected_index++; - } else + /* now try and add more pages onto the request */ + expected_index = page->index + 1; + list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { + /* discontinuity ? */ + if (page->index != expected_index) + break; + + /* would this page push the read over the rsize? */ + if (bytes + PAGE_CACHE_SIZE > rsize) break; + + __set_page_locked(page); + if (add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL)) { + __clear_page_locked(page); + break; + } + list_move_tail(&page->lru, &tmplist); + bytes += PAGE_CACHE_SIZE; + expected_index++; + nr_pages++; } - if (contig_pages + i > num_pages) - contig_pages = num_pages - i; - - /* for reads over a certain size could initiate async - read ahead */ - - read_size = contig_pages * PAGE_CACHE_SIZE; - /* Read size needs to be in multiples of one page */ - read_size = min_t(const unsigned int, read_size, - cifs_sb->rsize & PAGE_CACHE_MASK); - cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d", - read_size, contig_pages); - rc = -EAGAIN; - while (rc == -EAGAIN) { + + rdata = cifs_readdata_alloc(nr_pages); + if (!rdata) { + /* best to give up if we're out of mem */ + list_for_each_entry_safe(page, tpage, &tmplist, lru) { + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); + } + rc = -ENOMEM; + break; + } + + spin_lock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&cifs_file_list_lock); + rdata->cfile = open_file; + rdata->mapping = mapping; + rdata->offset = offset; + rdata->bytes = bytes; + rdata->pid = pid; + list_splice_init(&tmplist, &rdata->pages); + + do { if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); if (rc != 0) - break; + continue; } - io_parms.netfid = open_file->netfid; - io_parms.pid = pid; - io_parms.tcon = pTcon; - io_parms.offset = offset; - io_parms.length = read_size; - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, - &smb_read_data, &buf_type); - /* BB more RC checks ? */ - if (rc == -EAGAIN) { - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; - } - } - } - if ((rc < 0) || (smb_read_data == NULL)) { - cFYI(1, "Read error in readpages: %d", rc); - break; - } else if (bytes_read > 0) { - task_io_account_read(bytes_read); - pSMBr = (struct smb_com_read_rsp *)smb_read_data; - cifs_copy_cache_pages(mapping, page_list, bytes_read, - smb_read_data + 4 /* RFC1001 hdr */ + - le16_to_cpu(pSMBr->DataOffset)); - - i += bytes_read >> PAGE_CACHE_SHIFT; - cifs_stats_bytes_read(pTcon, bytes_read); - if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) { - i++; /* account for partial page */ - - /* server copy of file can have smaller size - than client */ - /* BB do we need to verify this common case ? - this case is ok - if we are at server EOF - we will hit it on next read */ + rc = cifs_async_readv(rdata); + } while (rc == -EAGAIN); - /* break; */ + if (rc != 0) { + list_for_each_entry_safe(page, tpage, &rdata->pages, + lru) { + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); } - } else { - cFYI(1, "No bytes read (%d) at offset %lld . " - "Cleaning remaining pages from readahead list", - bytes_read, offset); - /* BB turn off caching and do new lookup on - file size at server? */ + cifs_readdata_free(rdata); break; } - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; - } - bytes_read = 0; } -/* need to free smb_read_data buf before exit */ - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; - } - -read_complete: - FreeXid(xid); return rc; } @@ -2406,6 +2841,10 @@ void cifs_oplock_break(struct work_struct *work) cFYI(1, "Oplock flush inode %p rc %d", inode, rc); } + rc = cifs_push_locks(cfile); + if (rc) + cERROR(1, "Push locks rc = %d", rc); + /* * releasing stale oplock after recent reconnect of smb session using * a now incorrect file handle is not a data integrity issue but do @@ -2413,36 +2852,12 @@ void cifs_oplock_break(struct work_struct *work) * disconnected since oplock already released by the server */ if (!cfile->oplock_break_cancelled) { - rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, - 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, + rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, + current->tgid, 0, 0, 0, 0, + LOCKING_ANDX_OPLOCK_RELEASE, false, cinode->clientCanCacheRead ? 1 : 0); cFYI(1, "Oplock release rc = %d", rc); } - - /* - * We might have kicked in before is_valid_oplock_break() - * finished grabbing reference for us. Make sure it's done by - * waiting for cifs_file_list_lock. - */ - spin_lock(&cifs_file_list_lock); - spin_unlock(&cifs_file_list_lock); - - cifs_oplock_break_put(cfile); -} - -/* must be called while holding cifs_file_list_lock */ -void cifs_oplock_break_get(struct cifsFileInfo *cfile) -{ - cifs_sb_active(cfile->dentry->d_sb); - cifsFileInfo_get(cfile); -} - -void cifs_oplock_break_put(struct cifsFileInfo *cfile) -{ - struct super_block *sb = cfile->dentry->d_sb; - - cifsFileInfo_put(cfile); - cifs_sb_deactive(sb); } const struct address_space_operations cifs_addr_ops = { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9b018c8334fa..e851d5b8931e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -132,7 +132,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) inode->i_mtime = fattr->cf_mtime; inode->i_ctime = fattr->cf_ctime; inode->i_rdev = fattr->cf_rdev; - inode->i_nlink = fattr->cf_nlink; + set_nlink(inode, fattr->cf_nlink); inode->i_uid = fattr->cf_uid; inode->i_gid = fattr->cf_gid; @@ -562,7 +562,16 @@ int cifs_get_file_info(struct file *filp) xid = GetXid(); rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); - if (rc == -EOPNOTSUPP || rc == -EINVAL) { + switch (rc) { + case 0: + cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); + break; + case -EREMOTE: + cifs_create_dfs_fattr(&fattr, inode->i_sb); + rc = 0; + break; + case -EOPNOTSUPP: + case -EINVAL: /* * FIXME: legacy server -- fall back to path-based call? * for now, just skip revalidating and mark inode for @@ -570,18 +579,14 @@ int cifs_get_file_info(struct file *filp) */ rc = 0; CIFS_I(inode)->time = 0; + default: goto cgfi_exit; - } else if (rc == -EREMOTE) { - cifs_create_dfs_fattr(&fattr, inode->i_sb); - rc = 0; - } else if (rc) - goto cgfi_exit; + } /* * don't bother with SFU junk here -- just mark inode as needing * revalidation. */ - cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; cifs_fattr_to_inode(inode, &fattr); @@ -764,20 +769,10 @@ char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, if (full_path == NULL) return full_path; - if (dfsplen) { + if (dfsplen) strncpy(full_path, tcon->treeName, dfsplen); - /* switch slash direction in prepath depending on whether - * windows or posix style path names - */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { - int i; - for (i = 0; i < dfsplen; i++) { - if (full_path[i] == '\\') - full_path[i] = '/'; - } - } - } strncpy(full_path + dfsplen, vol->prepath, pplen); + convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); full_path[dfsplen + pplen] = 0; /* add trailing null */ return full_path; } @@ -910,7 +905,7 @@ struct inode *cifs_root_iget(struct super_block *sb) if (rc && tcon->ipc) { cFYI(1, "ipc connection - fake read inode"); inode->i_mode |= S_IFDIR; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_op = &cifs_ipc_inode_ops; inode->i_fop = &simple_dir_operations; inode->i_uid = cifs_sb->mnt_uid; @@ -1372,7 +1367,7 @@ mkdir_get_info: /* setting nlink not necessary except in cases where we * failed to get it from the server or was set bogus */ if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) - direntry->d_inode->i_nlink = 2; + set_nlink(direntry->d_inode, 2); mode &= ~current_umask(); /* must turn on setgid bit if parent dir has it */ @@ -2106,6 +2101,8 @@ static int cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) { int xid; + uid_t uid = NO_CHANGE_32; + gid_t gid = NO_CHANGE_32; struct inode *inode = direntry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cifsInode = CIFS_I(inode); @@ -2156,13 +2153,25 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) goto cifs_setattr_exit; } - /* - * Without unix extensions we can't send ownership changes to the - * server, so silently ignore them. This is consistent with how - * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With - * CIFSACL support + proper Windows to Unix idmapping, we may be - * able to support this in the future. - */ + if (attrs->ia_valid & ATTR_UID) + uid = attrs->ia_uid; + + if (attrs->ia_valid & ATTR_GID) + gid = attrs->ia_gid; + +#ifdef CONFIG_CIFS_ACL + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) { + rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64, + uid, gid); + if (rc) { + cFYI(1, "%s: Setting id failed with error: %d", + __func__, rc); + goto cifs_setattr_exit; + } + } + } else +#endif /* CONFIG_CIFS_ACL */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); @@ -2171,15 +2180,12 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) attrs->ia_valid &= ~ATTR_MODE; if (attrs->ia_valid & ATTR_MODE) { - cFYI(1, "Mode changed to 0%o", attrs->ia_mode); mode = attrs->ia_mode; - } - - if (attrs->ia_valid & ATTR_MODE) { rc = 0; #ifdef CONFIG_CIFS_ACL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - rc = mode_to_cifs_acl(inode, full_path, mode); + rc = id_mode_to_cifs_acl(inode, full_path, mode, + NO_CHANGE_32, NO_CHANGE_32); if (rc) { cFYI(1, "%s: Setting ACL failed with error: %d", __func__, rc); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 556b1a0b54de..6b0e06434391 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -74,8 +74,14 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) cERROR(1, "%s: Could not init md5 shash\n", __func__); goto symlink_hash_err; } - crypto_shash_update(&sdescmd5->shash, link_str, link_len); + rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); + if (rc) { + cERROR(1, "%s: Could not update iwth link_str\n", __func__); + goto symlink_hash_err; + } rc = crypto_shash_final(&sdescmd5->shash, md5_hash); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); symlink_hash_err: crypto_free_shash(md5); @@ -177,14 +183,20 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) static int CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, - const struct nls_table *nls_codepage, int remap) + struct cifs_sb_info *cifs_sb) { int rc; int oplock = 0; + int remap; + int create_options = CREATE_NOT_DIR; __u16 netfid = 0; u8 *buf; unsigned int bytes_written = 0; struct cifs_io_parms io_parms; + struct nls_table *nls_codepage; + + nls_codepage = cifs_sb->local_nls; + remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); if (!buf) @@ -196,8 +208,11 @@ CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, return rc; } + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE, - CREATE_NOT_DIR, &netfid, &oplock, NULL, + create_options, &netfid, &oplock, NULL, nls_codepage, remap); if (rc != 0) { kfree(buf); @@ -418,7 +433,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, if (old_file->d_inode) { cifsInode = CIFS_I(old_file->d_inode); if (rc == 0) { - old_file->d_inode->i_nlink++; + inc_nlink(old_file->d_inode); /* BB should we make this contingent on superblock flag NOATIME? */ /* old_file->d_inode->i_ctime = CURRENT_TIME;*/ /* parent dir timestamps will update from srv @@ -553,9 +568,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) /* BB what if DFS and this volume is on different share? BB */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb); else if (pTcon->unix_ext) rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, cifs_sb->local_nls); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 03a1f491d39b..703ef5c6fdb1 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -420,19 +420,22 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) } int -checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) +checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read) { - __u32 len = be32_to_cpu(smb->smb_buf_length); + __u32 rfclen = be32_to_cpu(smb->smb_buf_length); __u32 clc_len; /* calculated length */ - cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len); + cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", + total_read, rfclen); - if (length < 2 + sizeof(struct smb_hdr)) { - if ((length >= sizeof(struct smb_hdr) - 1) + /* is this frame too small to even get to a BCC? */ + if (total_read < 2 + sizeof(struct smb_hdr)) { + if ((total_read >= sizeof(struct smb_hdr) - 1) && (smb->Status.CifsError != 0)) { + /* it's an error return */ smb->WordCount = 0; /* some error cases do not return wct and bcc */ return 0; - } else if ((length == sizeof(struct smb_hdr) + 1) && + } else if ((total_read == sizeof(struct smb_hdr) + 1) && (smb->WordCount == 0)) { char *tmp = (char *)smb; /* Need to work around a bug in two servers here */ @@ -452,39 +455,35 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) } else { cERROR(1, "Length less than smb header size"); } - return 1; - } - if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cERROR(1, "smb length greater than MaxBufSize, mid=%d", - smb->Mid); - return 1; + return -EIO; } + /* otherwise, there is enough to get to the BCC */ if (check_smb_hdr(smb, mid)) - return 1; + return -EIO; clc_len = smbCalcSize(smb); - if (4 + len != length) { + if (4 + rfclen != total_read) { cERROR(1, "Length read does not match RFC1001 length %d", - len); - return 1; + rfclen); + return -EIO; } - if (4 + len != clc_len) { + if (4 + rfclen != clc_len) { /* check if bcc wrapped around for large read responses */ - if ((len > 64 * 1024) && (len > clc_len)) { + if ((rfclen > 64 * 1024) && (rfclen > clc_len)) { /* check if lengths match mod 64K */ - if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) + if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF)) return 0; /* bcc wrapped */ } cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", - clc_len, 4 + len, smb->Mid); + clc_len, 4 + rfclen, smb->Mid); - if (4 + len < clc_len) { + if (4 + rfclen < clc_len) { cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", - len, smb->Mid); - return 1; - } else if (len > clc_len + 512) { + rfclen, smb->Mid); + return -EIO; + } else if (rfclen > clc_len + 512) { /* * Some servers (Windows XP in particular) send more * data than the lengths in the SMB packet would @@ -495,8 +494,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) * data to 512 bytes. */ cERROR(1, "RFC1001 size %u more than 512 bytes larger " - "than SMB for mid=%u", len, smb->Mid); - return 1; + "than SMB for mid=%u", rfclen, smb->Mid); + return -EIO; } } return 0; @@ -585,15 +584,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) cifs_set_oplock_level(pCifsInode, pSMB->OplockLevel ? OPLOCK_READ : 0); - /* - * cifs_oplock_break_put() can't be called - * from here. Get reference after queueing - * succeeded. cifs_oplock_break() will - * synchronize using cifs_file_list_lock. - */ - if (queue_work(system_nrt_wq, - &netfile->oplock_break)) - cifs_oplock_break_get(netfile); + queue_work(system_nrt_wq, + &netfile->oplock_break); netfile->oplock_break_cancelled = false; spin_unlock(&cifs_file_list_lock); @@ -683,3 +675,18 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->clientCanCacheRead = false; } } + +bool +backup_cred(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) { + if (cifs_sb->mnt_backupuid == current_fsuid()) + return true; + } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) { + if (in_group_p(cifs_sb->mnt_backupgid)) + return true; + } + + return false; +} diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 965a3af186a1..5de03ec20144 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -4,6 +4,7 @@ * Directory search handling * * Copyright (C) International Business Machines Corp., 2004, 2008 + * Copyright (C) Red Hat, Inc., 2011 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -290,10 +291,10 @@ error_exit: } /* return length of unicode string in bytes */ -static int cifs_unicode_bytelen(char *str) +static int cifs_unicode_bytelen(const char *str) { int len; - __le16 *ustr = (__le16 *)str; + const __le16 *ustr = (const __le16 *)str; for (len = 0; len <= PATH_MAX; len++) { if (ustr[len] == 0) @@ -334,78 +335,128 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) } +struct cifs_dirent { + const char *name; + size_t namelen; + u32 resume_key; + u64 ino; +}; + +static void cifs_fill_dirent_unix(struct cifs_dirent *de, + const FILE_UNIX_INFO *info, bool is_unicode) +{ + de->name = &info->FileName[0]; + if (is_unicode) + de->namelen = cifs_unicode_bytelen(de->name); + else + de->namelen = strnlen(de->name, PATH_MAX); + de->resume_key = info->ResumeKey; + de->ino = le64_to_cpu(info->basic.UniqueId); +} + +static void cifs_fill_dirent_dir(struct cifs_dirent *de, + const FILE_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_full(struct cifs_dirent *de, + const FILE_FULL_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_search(struct cifs_dirent *de, + const SEARCH_ID_FULL_DIR_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; + de->ino = le64_to_cpu(info->UniqueId); +} + +static void cifs_fill_dirent_both(struct cifs_dirent *de, + const FILE_BOTH_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_std(struct cifs_dirent *de, + const FIND_FILE_STANDARD_INFO *info) +{ + de->name = &info->FileName[0]; + /* one byte length, no endianess conversion */ + de->namelen = info->FileNameLength; + de->resume_key = info->ResumeKey; +} + +static int cifs_fill_dirent(struct cifs_dirent *de, const void *info, + u16 level, bool is_unicode) +{ + memset(de, 0, sizeof(*de)); + + switch (level) { + case SMB_FIND_FILE_UNIX: + cifs_fill_dirent_unix(de, info, is_unicode); + break; + case SMB_FIND_FILE_DIRECTORY_INFO: + cifs_fill_dirent_dir(de, info); + break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: + cifs_fill_dirent_full(de, info); + break; + case SMB_FIND_FILE_ID_FULL_DIR_INFO: + cifs_fill_dirent_search(de, info); + break; + case SMB_FIND_FILE_BOTH_DIRECTORY_INFO: + cifs_fill_dirent_both(de, info); + break; + case SMB_FIND_FILE_INFO_STANDARD: + cifs_fill_dirent_std(de, info); + break; + default: + cFYI(1, "Unknown findfirst level %d", level); + return -EINVAL; + } + + return 0; +} + #define UNICODE_DOT cpu_to_le16(0x2e) /* return 0 if no match and 1 for . (current directory) and 2 for .. (parent) */ -static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile) +static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode) { int rc = 0; - char *filename = NULL; - int len = 0; - - if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; - filename = &pFindData->FileName[0]; - if (cfile->srch_inf.unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, 5); - } - } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == - SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == - SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == - SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = pFindData->FileNameLength; - } else { - cFYI(1, "Unknown findfirst level %d", - cfile->srch_inf.info_level); - } - if (filename) { - if (cfile->srch_inf.unicode) { - __le16 *ufilename = (__le16 *)filename; - if (len == 2) { - /* check for . */ - if (ufilename[0] == UNICODE_DOT) - rc = 1; - } else if (len == 4) { - /* check for .. */ - if ((ufilename[0] == UNICODE_DOT) - && (ufilename[1] == UNICODE_DOT)) - rc = 2; - } - } else /* ASCII */ { - if (len == 1) { - if (filename[0] == '.') - rc = 1; - } else if (len == 2) { - if ((filename[0] == '.') && (filename[1] == '.')) - rc = 2; - } + if (!de->name) + return 0; + + if (is_unicode) { + __le16 *ufilename = (__le16 *)de->name; + if (de->namelen == 2) { + /* check for . */ + if (ufilename[0] == UNICODE_DOT) + rc = 1; + } else if (de->namelen == 4) { + /* check for .. */ + if (ufilename[0] == UNICODE_DOT && + ufilename[1] == UNICODE_DOT) + rc = 2; + } + } else /* ASCII */ { + if (de->namelen == 1) { + if (de->name[0] == '.') + rc = 1; + } else if (de->namelen == 2) { + if (de->name[0] == '.' && de->name[1] == '.') + rc = 2; } } @@ -427,66 +478,18 @@ static int is_dir_changed(struct file *file) } static int cifs_save_resume_key(const char *current_entry, - struct cifsFileInfo *cifsFile) + struct cifsFileInfo *file_info) { - int rc = 0; - unsigned int len = 0; - __u16 level; - char *filename; - - if ((cifsFile == NULL) || (current_entry == NULL)) - return -EINVAL; - - level = cifsFile->srch_inf.info_level; - - if (level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; + struct cifs_dirent de; + int rc; - filename = &pFindData->FileName[0]; - if (cifsFile->srch_inf.unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, PATH_MAX); - } - cifsFile->srch_inf.resume_key = pFindData->ResumeKey; - } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - /* one byte length, no name conversion */ - len = (unsigned int)pFindData->FileNameLength; - cifsFile->srch_inf.resume_key = pFindData->ResumeKey; - } else { - cFYI(1, "Unknown findfirst level %d", level); - return -EINVAL; + rc = cifs_fill_dirent(&de, current_entry, file_info->srch_inf.info_level, + file_info->srch_inf.unicode); + if (!rc) { + file_info->srch_inf.presume_name = de.name; + file_info->srch_inf.resume_name_len = de.namelen; + file_info->srch_inf.resume_key = de.resume_key; } - cifsFile->srch_inf.resume_name_len = len; - cifsFile->srch_inf.presume_name = filename; return rc; } @@ -605,136 +608,70 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, return rc; } -/* inode num, inode type and filename returned */ -static int cifs_get_name_from_search_buf(struct qstr *pqst, - char *current_entry, __u16 level, unsigned int unicode, - struct cifs_sb_info *cifs_sb, unsigned int max_len, __u64 *pinum) +static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, + void *dirent, char *scratch_buf, unsigned int max_len) { + struct cifsFileInfo *file_info = file->private_data; + struct super_block *sb = file->f_path.dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_dirent de = { NULL, }; + struct cifs_fattr fattr; + struct dentry *dentry; + struct qstr name; int rc = 0; - unsigned int len = 0; - char *filename; - struct nls_table *nlt = cifs_sb->local_nls; - - *pinum = 0; - - if (level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; - - filename = &pFindData->FileName[0]; - if (unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, PATH_MAX); - } + ino_t ino; - *pinum = le64_to_cpu(pFindData->basic.UniqueId); - } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - *pinum = le64_to_cpu(pFindData->UniqueId); - } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - /* one byte length, no name conversion */ - len = (unsigned int)pFindData->FileNameLength; - } else { - cFYI(1, "Unknown findfirst level %d", level); - return -EINVAL; - } + rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level, + file_info->srch_inf.unicode); + if (rc) + return rc; - if (len > max_len) { - cERROR(1, "bad search response length %d past smb end", len); + if (de.namelen > max_len) { + cERROR(1, "bad search response length %zd past smb end", + de.namelen); return -EINVAL; } - if (unicode) { - pqst->len = cifs_from_ucs2((char *) pqst->name, - (__le16 *) filename, - UNICODE_NAME_MAX, - min(len, max_len), nlt, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - pqst->len -= nls_nullsize(nlt); - } else { - pqst->name = filename; - pqst->len = len; - } - return rc; -} - -static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, - void *direntry, char *scratch_buf, unsigned int max_len) -{ - int rc = 0; - struct qstr qstring; - struct cifsFileInfo *pCifsF; - u64 inum; - ino_t ino; - struct super_block *sb; - struct cifs_sb_info *cifs_sb; - struct dentry *tmp_dentry; - struct cifs_fattr fattr; - - /* get filename and len into qstring */ - /* get dentry */ - /* decide whether to create and populate ionde */ - if ((direntry == NULL) || (file == NULL)) - return -EINVAL; - - pCifsF = file->private_data; - - if ((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL)) - return -ENOENT; - - rc = cifs_entry_is_dot(pfindEntry, pCifsF); /* skip . and .. since we added them first */ - if (rc != 0) + if (cifs_entry_is_dot(&de, file_info->srch_inf.unicode)) return 0; - sb = file->f_path.dentry->d_sb; - cifs_sb = CIFS_SB(sb); - - qstring.name = scratch_buf; - rc = cifs_get_name_from_search_buf(&qstring, pfindEntry, - pCifsF->srch_inf.info_level, - pCifsF->srch_inf.unicode, cifs_sb, - max_len, &inum /* returned */); + if (file_info->srch_inf.unicode) { + struct nls_table *nlt = cifs_sb->local_nls; - if (rc) - return rc; + name.name = scratch_buf; + name.len = + cifs_from_ucs2((char *)name.name, (__le16 *)de.name, + UNICODE_NAME_MAX, + min(de.namelen, (size_t)max_len), nlt, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + name.len -= nls_nullsize(nlt); + } else { + name.name = de.name; + name.len = de.namelen; + } - if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) + switch (file_info->srch_inf.info_level) { + case SMB_FIND_FILE_UNIX: cifs_unix_basic_to_fattr(&fattr, - &((FILE_UNIX_INFO *) pfindEntry)->basic, - cifs_sb); - else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) - cifs_std_info_to_fattr(&fattr, (FIND_FILE_STANDARD_INFO *) - pfindEntry, cifs_sb); - else - cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *) - pfindEntry, cifs_sb); + &((FILE_UNIX_INFO *)find_entry)->basic, + cifs_sb); + break; + case SMB_FIND_FILE_INFO_STANDARD: + cifs_std_info_to_fattr(&fattr, + (FIND_FILE_STANDARD_INFO *)find_entry, + cifs_sb); + break; + default: + cifs_dir_info_to_fattr(&fattr, + (FILE_DIRECTORY_INFO *)find_entry, + cifs_sb); + break; + } - if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { - fattr.cf_uniqueid = inum; + if (de.ino && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + fattr.cf_uniqueid = de.ino; } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); cifs_autodisable_serverino(cifs_sb); @@ -750,12 +687,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); - tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); + dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr); - rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, - ino, fattr.cf_dtype); + rc = filldir(dirent, name.name, name.len, file->f_pos, ino, + fattr.cf_dtype); - dput(tmp_dentry); + dput(dentry); return rc; } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index d3e619692ee0..4ec3ee9d72cc 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -124,7 +124,9 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) /* that we use in next few lines */ /* Note that header is initialized to zero in header_assemble */ pSMB->req.AndXCommand = 0xFF; - pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); + pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32, + CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, + USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); pSMB->req.VcNumber = get_next_vcnum(ses); @@ -681,7 +683,7 @@ ssetup_ntlmssp_authenticate: cpu_to_le16(CIFS_AUTH_RESP_SIZE); /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses); + rc = setup_ntlm_response(ses, nls_cp); if (rc) { cERROR(1, "Error %d during NTLM authentication", rc); goto ssetup_exit; diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 1c5b770c3141..7cacba12b8f1 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -157,8 +157,14 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) cERROR(1, "%s: Could not init md4 shash\n", __func__); goto mdfour_err; } - crypto_shash_update(&sdescmd4->shash, link_str, link_len); + rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len); + if (rc) { + cERROR(1, "%s: Could not update with link_str\n", __func__); + goto mdfour_err; + } rc = crypto_shash_final(&sdescmd4->shash, md4_hash); + if (rc) + cERROR(1, "%s: Could not genereate md4 hash\n", __func__); mdfour_err: crypto_free_shash(md4); @@ -193,160 +199,36 @@ SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24) return rc; } -/* Routines for Windows NT MD4 Hash functions. */ -static int -_my_wcslen(__u16 *str) -{ - int len = 0; - while (*str++ != 0) - len++; - return len; -} - -/* - * Convert a string into an NT UNICODE string. - * Note that regardless of processor type - * this must be in intel (little-endian) - * format. - */ - -static int -_my_mbstowcs(__u16 *dst, const unsigned char *src, int len) -{ /* BB not a very good conversion routine - change/fix */ - int i; - __u16 val; - - for (i = 0; i < len; i++) { - val = *src; - SSVAL(dst, 0, val); - dst++; - src++; - if (val == 0) - break; - } - return i; -} - /* * Creates the MD4 Hash of the users password in NT UNICODE. */ int -E_md4hash(const unsigned char *passwd, unsigned char *p16) +E_md4hash(const unsigned char *passwd, unsigned char *p16, + const struct nls_table *codepage) { int rc; int len; __u16 wpwd[129]; /* Password cannot be longer than 128 characters */ - if (passwd) { - len = strlen((char *) passwd); - if (len > 128) - len = 128; - - /* Password must be converted to NT unicode */ - _my_mbstowcs(wpwd, passwd, len); - } else + if (passwd) /* Password must be converted to NT unicode */ + len = cifs_strtoUCS(wpwd, passwd, 128, codepage); + else { len = 0; + *wpwd = 0; /* Ensure string is null terminated */ + } - wpwd[len] = 0; /* Ensure string is null terminated */ - /* Calculate length in bytes */ - len = _my_wcslen(wpwd) * sizeof(__u16); - - rc = mdfour(p16, (unsigned char *) wpwd, len); - memset(wpwd, 0, 129 * 2); + rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__u16)); + memset(wpwd, 0, 129 * sizeof(__u16)); return rc; } -#if 0 /* currently unused */ -/* Does both the NT and LM owfs of a user's password */ -static void -nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16]) -{ - char passwd[514]; - - memset(passwd, '\0', 514); - if (strlen(pwd) < 513) - strcpy(passwd, pwd); - else - memcpy(passwd, pwd, 512); - /* Calculate the MD4 hash (NT compatible) of the password */ - memset(nt_p16, '\0', 16); - E_md4hash(passwd, nt_p16); - - /* Mangle the passwords into Lanman format */ - passwd[14] = '\0'; -/* strupper(passwd); */ - - /* Calculate the SMB (lanman) hash functions of the password */ - - memset(p16, '\0', 16); - E_P16((unsigned char *) passwd, (unsigned char *) p16); - - /* clear out local copy of user's password (just being paranoid). */ - memset(passwd, '\0', sizeof(passwd)); -} -#endif - -/* Does the NTLMv2 owfs of a user's password */ -#if 0 /* function not needed yet - but will be soon */ -static void -ntv2_owf_gen(const unsigned char owf[16], const char *user_n, - const char *domain_n, unsigned char kr_buf[16], - const struct nls_table *nls_codepage) -{ - wchar_t *user_u; - wchar_t *dom_u; - int user_l, domain_l; - struct HMACMD5Context ctx; - - /* might as well do one alloc to hold both (user_u and dom_u) */ - user_u = kmalloc(2048 * sizeof(wchar_t), GFP_KERNEL); - if (user_u == NULL) - return; - dom_u = user_u + 1024; - - /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2, - STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); - push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2, - STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */ - - /* BB user and domain may need to be uppercased */ - user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage); - domain_l = cifs_strtoUCS(dom_u, domain_n, 511, nls_codepage); - - user_l++; /* trailing null */ - domain_l++; - - hmac_md5_init_limK_to_64(owf, 16, &ctx); - hmac_md5_update((const unsigned char *) user_u, user_l * 2, &ctx); - hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx); - hmac_md5_final(kr_buf, &ctx); - - kfree(user_u); -} -#endif - -/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ -#if 0 /* currently unused */ -static void -NTLMSSPOWFencrypt(unsigned char passwd[8], - unsigned char *ntlmchalresp, unsigned char p24[24]) -{ - unsigned char p21[21]; - - memset(p21, '\0', 21); - memcpy(p21, passwd, 8); - memset(p21 + 8, 0xbd, 8); - - E_P24(p21, ntlmchalresp, p24); -} -#endif - /* Does the NT MD4 hash then des encryption. */ int -SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) +SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24, + const struct nls_table *codepage) { int rc; unsigned char p16[16], p21[21]; @@ -354,7 +236,7 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) memset(p16, '\0', 16); memset(p21, '\0', 21); - rc = E_md4hash(passwd, p16); + rc = E_md4hash(passwd, p16, codepage); if (rc) { cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); return rc; @@ -363,39 +245,3 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) rc = E_P24(p21, c8, p24); return rc; } - - -/* Does the md5 encryption from the NT hash for NTLMv2. */ -/* These routines will be needed later */ -#if 0 -static void -SMBOWFencrypt_ntv2(const unsigned char kr[16], - const struct data_blob *srv_chal, - const struct data_blob *cli_chal, unsigned char resp_buf[16]) -{ - struct HMACMD5Context ctx; - - hmac_md5_init_limK_to_64(kr, 16, &ctx); - hmac_md5_update(srv_chal->data, srv_chal->length, &ctx); - hmac_md5_update(cli_chal->data, cli_chal->length, &ctx); - hmac_md5_final(resp_buf, &ctx); -} - -static void -SMBsesskeygen_ntv2(const unsigned char kr[16], - const unsigned char *nt_resp, __u8 sess_key[16]) -{ - struct HMACMD5Context ctx; - - hmac_md5_init_limK_to_64(kr, 16, &ctx); - hmac_md5_update(nt_resp, 16, &ctx); - hmac_md5_final((unsigned char *) sess_key, &ctx); -} - -static void -SMBsesskeygen_ntv1(const unsigned char kr[16], - const unsigned char *nt_resp, __u8 sess_key[16]) -{ - mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16); -} -#endif diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 147aa22c3c3a..0cc9584f5889 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -26,6 +26,7 @@ #include <linux/wait.h> #include <linux/net.h> #include <linux/delay.h> +#include <linux/freezer.h> #include <asm/uaccess.h> #include <asm/processor.h> #include <linux/mempool.h> @@ -266,15 +267,11 @@ static int wait_for_free_request(struct TCP_Server_Info *server, while (1) { if (atomic_read(&server->inFlight) >= cifs_max_pending) { spin_unlock(&GlobalMid_Lock); -#ifdef CONFIG_CIFS_STATS2 - atomic_inc(&server->num_waiters); -#endif + cifs_num_waiters_inc(server); wait_event(server->request_q, atomic_read(&server->inFlight) < cifs_max_pending); -#ifdef CONFIG_CIFS_STATS2 - atomic_dec(&server->num_waiters); -#endif + cifs_num_waiters_dec(server); spin_lock(&GlobalMid_Lock); } else { if (server->tcpStatus == CifsExiting) { @@ -328,7 +325,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) { int error; - error = wait_event_killable(server->response_q, + error = wait_event_freezekillable(server->response_q, midQ->midState != MID_REQUEST_SUBMITTED); if (error < 0) return -ERESTARTSYS; @@ -343,8 +340,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) */ int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, mid_callback_t *callback, void *cbdata, - bool ignore_pend) + unsigned int nvec, mid_receive_t *receive, + mid_callback_t *callback, void *cbdata, bool ignore_pend) { int rc; struct mid_q_entry *mid; @@ -362,6 +359,8 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, mid = AllocMidQEntry(hdr, server); if (mid == NULL) { mutex_unlock(&server->srv_mutex); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); return -ENOMEM; } @@ -376,18 +375,17 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, goto out_err; } + mid->receive = receive; mid->callback = callback; mid->callback_data = cbdata; mid->midState = MID_REQUEST_SUBMITTED; -#ifdef CONFIG_CIFS_STATS2 - atomic_inc(&server->inSend); -#endif + + cifs_in_send_inc(server); rc = smb_sendv(server, iov, nvec); -#ifdef CONFIG_CIFS_STATS2 - atomic_dec(&server->inSend); - mid->when_sent = jiffies; -#endif + cifs_in_send_dec(server); + cifs_save_when_sent(mid); mutex_unlock(&server->srv_mutex); + if (rc) goto out_err; @@ -500,13 +498,18 @@ int cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error) { - dump_smb(mid->resp_buf, - min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length))); + unsigned int len = be32_to_cpu(mid->resp_buf->smb_buf_length) + 4; + + dump_smb(mid->resp_buf, min_t(u32, 92, len)); /* convert the length into a more usable form */ if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + struct kvec iov; + + iov.iov_base = mid->resp_buf; + iov.iov_len = len; /* FIXME: add code to kill session */ - if (cifs_verify_signature(mid->resp_buf, server, + if (cifs_verify_signature(&iov, 1, server, mid->sequence_number + 1) != 0) cERROR(1, "Unexpected SMB signature"); } @@ -573,14 +576,10 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, } midQ->midState = MID_REQUEST_SUBMITTED; -#ifdef CONFIG_CIFS_STATS2 - atomic_inc(&ses->server->inSend); -#endif + cifs_in_send_inc(ses->server); rc = smb_sendv(ses->server, iov, n_vec); -#ifdef CONFIG_CIFS_STATS2 - atomic_dec(&ses->server->inSend); - midQ->when_sent = jiffies; -#endif + cifs_in_send_dec(ses->server); + cifs_save_when_sent(midQ); mutex_unlock(&ses->server->srv_mutex); @@ -701,14 +700,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, } midQ->midState = MID_REQUEST_SUBMITTED; -#ifdef CONFIG_CIFS_STATS2 - atomic_inc(&ses->server->inSend); -#endif + + cifs_in_send_inc(ses->server); rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); -#ifdef CONFIG_CIFS_STATS2 - atomic_dec(&ses->server->inSend); - midQ->when_sent = jiffies; -#endif + cifs_in_send_dec(ses->server); + cifs_save_when_sent(midQ); mutex_unlock(&ses->server->srv_mutex); if (rc < 0) @@ -841,14 +837,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, } midQ->midState = MID_REQUEST_SUBMITTED; -#ifdef CONFIG_CIFS_STATS2 - atomic_inc(&ses->server->inSend); -#endif + cifs_in_send_inc(ses->server); rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); -#ifdef CONFIG_CIFS_STATS2 - atomic_dec(&ses->server->inSend); - midQ->when_sent = jiffies; -#endif + cifs_in_send_dec(ses->server); + cifs_save_when_sent(midQ); mutex_unlock(&ses->server->srv_mutex); if (rc < 0) { diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 2a22fb2989e4..45f07c46f3ed 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/posix_acl_xattr.h> #include <linux/slab.h> +#include <linux/xattr.h> #include "cifsfs.h" #include "cifspdu.h" #include "cifsglob.h" @@ -31,16 +32,8 @@ #define MAX_EA_VALUE_SIZE 65535 #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" -#define CIFS_XATTR_USER_PREFIX "user." -#define CIFS_XATTR_SYSTEM_PREFIX "system." -#define CIFS_XATTR_OS2_PREFIX "os2." -#define CIFS_XATTR_SECURITY_PREFIX "security." -#define CIFS_XATTR_TRUSTED_PREFIX "trusted." -#define XATTR_TRUSTED_PREFIX_LEN 8 -#define XATTR_SECURITY_PREFIX_LEN 9 -/* BB need to add server (Samba e.g) support for security and trusted prefix */ - +/* BB need to add server (Samba e.g) support for security and trusted prefix */ int cifs_removexattr(struct dentry *direntry, const char *ea_name) { @@ -76,8 +69,8 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) } if (ea_name == NULL) { cFYI(1, "Null xattr names not supported"); - } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) - && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) { + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + && (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) { cFYI(1, "illegal xattr request %s (only user namespace supported)", ea_name); @@ -88,7 +81,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto remove_ea_exit; - ea_name += 5; /* skip past user. prefix */ + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, (__u16)0, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -149,21 +142,23 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, if (ea_name == NULL) { cFYI(1, "Null xattr names not supported"); - } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) cFYI(1, "attempt to set cifs inode metadata"); - ea_name += 5; /* skip past user. prefix */ + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, (__u16)value_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { + } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) + == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; - ea_name += 4; /* skip past os2. prefix */ + ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, (__u16)value_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -178,7 +173,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, #ifdef CONFIG_CIFS_ACL memcpy(pacl, ea_value, value_size); rc = set_cifs_acl(pacl, value_size, - direntry->d_inode, full_path); + direntry->d_inode, full_path, CIFS_ACL_DACL); if (rc == 0) /* force revalidate of the inode */ CIFS_I(direntry->d_inode)->time = 0; kfree(pacl); @@ -269,7 +264,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, /* return alt name if available as pseudo attr */ if (ea_name == NULL) { cFYI(1, "Null xattr names not supported"); - } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; @@ -277,15 +273,15 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cFYI(1, "attempt to query cifs inode metadata"); /* revalidate/getattr then populate from inode */ } /* BB add else when above is implemented */ - ea_name += 5; /* skip past user. prefix */ + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { + } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; - ea_name += 4; /* skip past os2. prefix */ + ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -339,10 +335,10 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cFYI(1, "Query CIFS ACL not supported yet"); #endif /* CONFIG_CIFS_ACL */ } else if (strncmp(ea_name, - CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { + XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { cFYI(1, "Trusted xattr namespace not supported yet"); } else if (strncmp(ea_name, - CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { + XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { cFYI(1, "Security xattr namespace not supported yet"); } else cFYI(1, diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 2bdbcc11b373..854ace712685 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -104,7 +104,7 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_gid != -1) inode->i_gid = (gid_t) attr->va_gid; if (attr->va_nlink != -1) - inode->i_nlink = attr->va_nlink; + set_nlink(inode, attr->va_nlink); if (attr->va_size != -1) inode->i_size = attr->va_size; if (attr->va_size != -1) diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index 44e17e9c21ae..cc0ea9fe5ecf 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -59,12 +59,11 @@ void coda_sysctl_clean(void); #define CODA_ALLOC(ptr, cast, size) do { \ if (size < PAGE_SIZE) \ - ptr = kmalloc((unsigned long) size, GFP_KERNEL); \ + ptr = kzalloc((unsigned long) size, GFP_KERNEL); \ else \ - ptr = (cast)vmalloc((unsigned long) size); \ + ptr = (cast)vzalloc((unsigned long) size); \ if (!ptr) \ printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ - else memset( ptr, 0, size ); \ } while (0) diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 0239433f50cb..28e7e135cfab 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -340,7 +340,7 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) if (!error) { /* VFS may delete the child */ if (de->d_inode) - de->d_inode->i_nlink = 0; + clear_nlink(de->d_inode); /* fix the link count of the parent */ coda_dir_drop_nlink(dir); diff --git a/fs/compat.c b/fs/compat.c index 0ea00832de23..c98787536bb8 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -37,7 +37,6 @@ #include <linux/dirent.h> #include <linux/fsnotify.h> #include <linux/highuid.h> -#include <linux/nfsd/syscall.h> #include <linux/personality.h> #include <linux/rwsem.h> #include <linux/tsacct_kern.h> @@ -247,11 +246,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(0, &ubuf->f_spare[0]) || - __put_user(0, &ubuf->f_spare[1]) || - __put_user(0, &ubuf->f_spare[2]) || - __put_user(0, &ubuf->f_spare[3]) || - __put_user(0, &ubuf->f_spare[4])) + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) return -EFAULT; return 0; } @@ -550,7 +546,7 @@ out: ssize_t compat_rw_copy_check_uvector(int type, const struct compat_iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) + struct iovec **ret_pointer, int check_access) { compat_ssize_t tot_len; struct iovec *iov = *ret_pointer = fast_pointer; @@ -597,7 +593,8 @@ ssize_t compat_rw_copy_check_uvector(int type, } if (len < 0) /* size_t not fitting in compat_ssize_t .. */ goto out; - if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + if (check_access && + !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { ret = -EFAULT; goto out; } @@ -1111,7 +1108,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, goto out; tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, - UIO_FASTIOV, iovstack, &iov); + UIO_FASTIOV, iovstack, &iov, 1); if (tot_len == 0) { ret = 0; goto out; @@ -1675,257 +1672,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, } #endif /* HAVE_SET_RESTORE_SIGMASK */ -#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED) -/* Stuff for NFS server syscalls... */ -struct compat_nfsctl_svc { - u16 svc32_port; - s32 svc32_nthreads; -}; - -struct compat_nfsctl_client { - s8 cl32_ident[NFSCLNT_IDMAX+1]; - s32 cl32_naddr; - struct in_addr cl32_addrlist[NFSCLNT_ADDRMAX]; - s32 cl32_fhkeytype; - s32 cl32_fhkeylen; - u8 cl32_fhkey[NFSCLNT_KEYMAX]; -}; - -struct compat_nfsctl_export { - char ex32_client[NFSCLNT_IDMAX+1]; - char ex32_path[NFS_MAXPATHLEN+1]; - compat_dev_t ex32_dev; - compat_ino_t ex32_ino; - compat_int_t ex32_flags; - __compat_uid_t ex32_anon_uid; - __compat_gid_t ex32_anon_gid; -}; - -struct compat_nfsctl_fdparm { - struct sockaddr gd32_addr; - s8 gd32_path[NFS_MAXPATHLEN+1]; - compat_int_t gd32_version; -}; - -struct compat_nfsctl_fsparm { - struct sockaddr gd32_addr; - s8 gd32_path[NFS_MAXPATHLEN+1]; - compat_int_t gd32_maxlen; -}; - -struct compat_nfsctl_arg { - compat_int_t ca32_version; /* safeguard */ - union { - struct compat_nfsctl_svc u32_svc; - struct compat_nfsctl_client u32_client; - struct compat_nfsctl_export u32_export; - struct compat_nfsctl_fdparm u32_getfd; - struct compat_nfsctl_fsparm u32_getfs; - } u; -#define ca32_svc u.u32_svc -#define ca32_client u.u32_client -#define ca32_export u.u32_export -#define ca32_getfd u.u32_getfd -#define ca32_getfs u.u32_getfs -}; - -union compat_nfsctl_res { - __u8 cr32_getfh[NFS_FHSIZE]; - struct knfsd_fh cr32_getfs; -}; - -static int compat_nfs_svc_trans(struct nfsctl_arg *karg, - struct compat_nfsctl_arg __user *arg) -{ - if (!access_ok(VERIFY_READ, &arg->ca32_svc, sizeof(arg->ca32_svc)) || - get_user(karg->ca_version, &arg->ca32_version) || - __get_user(karg->ca_svc.svc_port, &arg->ca32_svc.svc32_port) || - __get_user(karg->ca_svc.svc_nthreads, - &arg->ca32_svc.svc32_nthreads)) - return -EFAULT; - return 0; -} - -static int compat_nfs_clnt_trans(struct nfsctl_arg *karg, - struct compat_nfsctl_arg __user *arg) -{ - if (!access_ok(VERIFY_READ, &arg->ca32_client, - sizeof(arg->ca32_client)) || - get_user(karg->ca_version, &arg->ca32_version) || - __copy_from_user(&karg->ca_client.cl_ident[0], - &arg->ca32_client.cl32_ident[0], - NFSCLNT_IDMAX) || - __get_user(karg->ca_client.cl_naddr, - &arg->ca32_client.cl32_naddr) || - __copy_from_user(&karg->ca_client.cl_addrlist[0], - &arg->ca32_client.cl32_addrlist[0], - (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)) || - __get_user(karg->ca_client.cl_fhkeytype, - &arg->ca32_client.cl32_fhkeytype) || - __get_user(karg->ca_client.cl_fhkeylen, - &arg->ca32_client.cl32_fhkeylen) || - __copy_from_user(&karg->ca_client.cl_fhkey[0], - &arg->ca32_client.cl32_fhkey[0], - NFSCLNT_KEYMAX)) - return -EFAULT; - - return 0; -} - -static int compat_nfs_exp_trans(struct nfsctl_arg *karg, - struct compat_nfsctl_arg __user *arg) -{ - if (!access_ok(VERIFY_READ, &arg->ca32_export, - sizeof(arg->ca32_export)) || - get_user(karg->ca_version, &arg->ca32_version) || - __copy_from_user(&karg->ca_export.ex_client[0], - &arg->ca32_export.ex32_client[0], - NFSCLNT_IDMAX) || - __copy_from_user(&karg->ca_export.ex_path[0], - &arg->ca32_export.ex32_path[0], - NFS_MAXPATHLEN) || - __get_user(karg->ca_export.ex_dev, - &arg->ca32_export.ex32_dev) || - __get_user(karg->ca_export.ex_ino, - &arg->ca32_export.ex32_ino) || - __get_user(karg->ca_export.ex_flags, - &arg->ca32_export.ex32_flags) || - __get_user(karg->ca_export.ex_anon_uid, - &arg->ca32_export.ex32_anon_uid) || - __get_user(karg->ca_export.ex_anon_gid, - &arg->ca32_export.ex32_anon_gid)) - return -EFAULT; - SET_UID(karg->ca_export.ex_anon_uid, karg->ca_export.ex_anon_uid); - SET_GID(karg->ca_export.ex_anon_gid, karg->ca_export.ex_anon_gid); - - return 0; -} - -static int compat_nfs_getfd_trans(struct nfsctl_arg *karg, - struct compat_nfsctl_arg __user *arg) -{ - if (!access_ok(VERIFY_READ, &arg->ca32_getfd, - sizeof(arg->ca32_getfd)) || - get_user(karg->ca_version, &arg->ca32_version) || - __copy_from_user(&karg->ca_getfd.gd_addr, - &arg->ca32_getfd.gd32_addr, - (sizeof(struct sockaddr))) || - __copy_from_user(&karg->ca_getfd.gd_path, - &arg->ca32_getfd.gd32_path, - (NFS_MAXPATHLEN+1)) || - __get_user(karg->ca_getfd.gd_version, - &arg->ca32_getfd.gd32_version)) - return -EFAULT; - - return 0; -} - -static int compat_nfs_getfs_trans(struct nfsctl_arg *karg, - struct compat_nfsctl_arg __user *arg) -{ - if (!access_ok(VERIFY_READ,&arg->ca32_getfs,sizeof(arg->ca32_getfs)) || - get_user(karg->ca_version, &arg->ca32_version) || - __copy_from_user(&karg->ca_getfs.gd_addr, - &arg->ca32_getfs.gd32_addr, - (sizeof(struct sockaddr))) || - __copy_from_user(&karg->ca_getfs.gd_path, - &arg->ca32_getfs.gd32_path, - (NFS_MAXPATHLEN+1)) || - __get_user(karg->ca_getfs.gd_maxlen, - &arg->ca32_getfs.gd32_maxlen)) - return -EFAULT; - - return 0; -} - -/* This really doesn't need translations, we are only passing - * back a union which contains opaque nfs file handle data. - */ -static int compat_nfs_getfh_res_trans(union nfsctl_res *kres, - union compat_nfsctl_res __user *res) -{ - int err; - - err = copy_to_user(res, kres, sizeof(*res)); - - return (err) ? -EFAULT : 0; -} - -asmlinkage long compat_sys_nfsservctl(int cmd, - struct compat_nfsctl_arg __user *arg, - union compat_nfsctl_res __user *res) -{ - struct nfsctl_arg *karg; - union nfsctl_res *kres; - mm_segment_t oldfs; - int err; - - karg = kmalloc(sizeof(*karg), GFP_USER); - kres = kmalloc(sizeof(*kres), GFP_USER); - if(!karg || !kres) { - err = -ENOMEM; - goto done; - } - - switch(cmd) { - case NFSCTL_SVC: - err = compat_nfs_svc_trans(karg, arg); - break; - - case NFSCTL_ADDCLIENT: - err = compat_nfs_clnt_trans(karg, arg); - break; - - case NFSCTL_DELCLIENT: - err = compat_nfs_clnt_trans(karg, arg); - break; - - case NFSCTL_EXPORT: - case NFSCTL_UNEXPORT: - err = compat_nfs_exp_trans(karg, arg); - break; - - case NFSCTL_GETFD: - err = compat_nfs_getfd_trans(karg, arg); - break; - - case NFSCTL_GETFS: - err = compat_nfs_getfs_trans(karg, arg); - break; - - default: - err = -EINVAL; - break; - } - - if (err) - goto done; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - /* The __user pointer casts are valid because of the set_fs() */ - err = sys_nfsservctl(cmd, (void __user *) karg, (void __user *) kres); - set_fs(oldfs); - - if (err) - goto done; - - if((cmd == NFSCTL_GETFD) || - (cmd == NFSCTL_GETFS)) - err = compat_nfs_getfh_res_trans(kres, res); - -done: - kfree(karg); - kfree(kres); - return err; -} -#else /* !NFSD */ -long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2) -{ - return sys_ni_syscall(); -} -#endif - #ifdef CONFIG_EPOLL #ifdef HAVE_SET_RESTORE_SIGMASK diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 61abb638b4bf..51352de88ef1 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -68,6 +68,8 @@ #ifdef CONFIG_BLOCK #include <linux/loop.h> +#include <linux/cdrom.h> +#include <linux/fd.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> #include <scsi/sg.h> @@ -944,6 +946,9 @@ COMPATIBLE_IOCTL(FIOQSIZE) IGNORE_IOCTL(LOOP_CLR_FD) /* md calls this on random blockdevs */ IGNORE_IOCTL(RAID_VERSION) +/* qemu/qemu-img might call these two on plain files for probing */ +IGNORE_IOCTL(CDROM_DRIVE_STATUS) +IGNORE_IOCTL(FDGETPRM32) /* SG stuff */ COMPATIBLE_IOCTL(SG_SET_TIMEOUT) COMPATIBLE_IOCTL(SG_GET_TIMEOUT) @@ -998,6 +1003,7 @@ COMPATIBLE_IOCTL(PPPIOCCONNECT) COMPATIBLE_IOCTL(PPPIOCDISCONN) COMPATIBLE_IOCTL(PPPIOCATTCHAN) COMPATIBLE_IOCTL(PPPIOCGCHAN) +COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS) /* PPPOX */ COMPATIBLE_IOCTL(PPPOEIOCSFWD) COMPATIBLE_IOCTL(PPPOEIOCDFWD) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index c83f4768eeaa..ca418aaf6352 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -23,7 +23,8 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see Documentation/filesystems/configfs.txt for more information. + * Please see Documentation/filesystems/configfs/configfs.txt for more + * information. */ #undef DEBUG diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 76dc4c3e5d51..50cee7f9110b 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -23,7 +23,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see the file Documentation/filesystems/configfs.txt for + * Please see the file Documentation/filesystems/configfs/configfs.txt for * critical information about using the config_item interface. */ diff --git a/fs/dcache.c b/fs/dcache.c index be18598c7fd7..10ba92def3f6 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -36,6 +36,7 @@ #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/prefetch.h> +#include <linux/ratelimit.h> #include "internal.h" /* @@ -225,7 +226,7 @@ static void dentry_unlink_inode(struct dentry * dentry) } /* - * dentry_lru_(add|del|move_tail) must be called with d_lock held. + * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held. */ static void dentry_lru_add(struct dentry *dentry) { @@ -245,6 +246,9 @@ static void __dentry_lru_del(struct dentry *dentry) dentry_stat.nr_unused--; } +/* + * Remove a dentry with references from the LRU. + */ static void dentry_lru_del(struct dentry *dentry) { if (!list_empty(&dentry->d_lru)) { @@ -254,6 +258,23 @@ static void dentry_lru_del(struct dentry *dentry) } } +/* + * Remove a dentry that is unreferenced and about to be pruned + * (unhashed and destroyed) from the LRU, and inform the file system. + * This wrapper should be called _prior_ to unhashing a victim dentry. + */ +static void dentry_lru_prune(struct dentry *dentry) +{ + if (!list_empty(&dentry->d_lru)) { + if (dentry->d_flags & DCACHE_OP_PRUNE) + dentry->d_op->d_prune(dentry); + + spin_lock(&dcache_lru_lock); + __dentry_lru_del(dentry); + spin_unlock(&dcache_lru_lock); + } +} + static void dentry_lru_move_tail(struct dentry *dentry) { spin_lock(&dcache_lru_lock); @@ -301,6 +322,27 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) return parent; } +/* + * Unhash a dentry without inserting an RCU walk barrier or checking that + * dentry->d_lock is locked. The caller must take care of that, if + * appropriate. + */ +static void __d_shrink(struct dentry *dentry) +{ + if (!d_unhashed(dentry)) { + struct hlist_bl_head *b; + if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + b = &dentry->d_sb->s_anon; + else + b = d_hash(dentry->d_parent, dentry->d_name.hash); + + hlist_bl_lock(b); + __hlist_bl_del(&dentry->d_hash); + dentry->d_hash.pprev = NULL; + hlist_bl_unlock(b); + } +} + /** * d_drop - drop a dentry * @dentry: dentry to drop @@ -319,17 +361,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { - struct hlist_bl_head *b; - if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) - b = &dentry->d_sb->s_anon; - else - b = d_hash(dentry->d_parent, dentry->d_name.hash); - - hlist_bl_lock(b); - __hlist_bl_del(&dentry->d_hash); - dentry->d_hash.pprev = NULL; - hlist_bl_unlock(b); - + __d_shrink(dentry); dentry_rcuwalk_barrier(dentry); } } @@ -392,8 +424,12 @@ relock: if (ref) dentry->d_count--; - /* if dentry was on the d_lru list delete it from there */ - dentry_lru_del(dentry); + /* + * if dentry was on the d_lru list delete it from there. + * inform the fs via d_prune that this dentry is about to be + * unhashed and destroyed. + */ + dentry_lru_prune(dentry); /* if it was on the hash then remove it */ __d_drop(dentry); return d_kill(dentry, parent); @@ -511,9 +547,11 @@ int d_invalidate(struct dentry * dentry) * would make it unreachable from the root, * we might still populate it if it was a * working directory or similar). + * We also need to leave mountpoints alone, + * directory or not. */ - if (dentry->d_count > 1) { - if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { + if (dentry->d_count > 1 && dentry->d_inode) { + if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { spin_unlock(&dentry->d_lock); return -EBUSY; } @@ -784,6 +822,7 @@ relock: /** * prune_dcache_sb - shrink the dcache + * @sb: superblock * @nr_to_scan: number of entries to try to free * * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is @@ -828,44 +867,28 @@ EXPORT_SYMBOL(shrink_dcache_sb); static void shrink_dcache_for_umount_subtree(struct dentry *dentry) { struct dentry *parent; - unsigned detached = 0; BUG_ON(!IS_ROOT(dentry)); - /* detach this root from the system */ - spin_lock(&dentry->d_lock); - dentry_lru_del(dentry); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - for (;;) { /* descend to the first leaf in the current subtree */ - while (!list_empty(&dentry->d_subdirs)) { - struct dentry *loop; - - /* this is a branch with children - detach all of them - * from the system in one go */ - spin_lock(&dentry->d_lock); - list_for_each_entry(loop, &dentry->d_subdirs, - d_u.d_child) { - spin_lock_nested(&loop->d_lock, - DENTRY_D_LOCK_NESTED); - dentry_lru_del(loop); - __d_drop(loop); - spin_unlock(&loop->d_lock); - } - spin_unlock(&dentry->d_lock); - - /* move to the first child */ + while (!list_empty(&dentry->d_subdirs)) dentry = list_entry(dentry->d_subdirs.next, struct dentry, d_u.d_child); - } /* consume the dentries from this leaf up through its parents * until we find one with children or run out altogether */ do { struct inode *inode; + /* + * remove the dentry from the lru, and inform + * the fs that this dentry is about to be + * unhashed and destroyed. + */ + dentry_lru_prune(dentry); + __d_shrink(dentry); + if (dentry->d_count != 0) { printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%s}" @@ -886,14 +909,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) list_del(&dentry->d_u.d_child); } else { parent = dentry->d_parent; - spin_lock(&parent->d_lock); parent->d_count--; list_del(&dentry->d_u.d_child); - spin_unlock(&parent->d_lock); } - detached++; - inode = dentry->d_inode; if (inode) { dentry->d_inode = NULL; @@ -938,9 +957,7 @@ void shrink_dcache_for_umount(struct super_block *sb) dentry = sb->s_root; sb->s_root = NULL; - spin_lock(&dentry->d_lock); dentry->d_count--; - spin_unlock(&dentry->d_lock); shrink_dcache_for_umount_subtree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { @@ -1297,6 +1314,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_REVALIDATE; if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; + if (op->d_prune) + dentry->d_flags |= DCACHE_OP_PRUNE; } EXPORT_SYMBOL(d_set_d_op); @@ -1743,7 +1762,7 @@ seqretry: */ if (read_seqcount_retry(&dentry->d_seq, *seq)) goto seqretry; - if (parent->d_flags & DCACHE_OP_COMPARE) { + if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { if (parent->d_op->d_compare(parent, *inode, dentry, i, tlen, tname, name)) @@ -2138,8 +2157,9 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry, * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. Caller hold - * rename_lock. + * dcache entries should not be moved in this way. Caller must hold + * rename_lock, the i_mutex of the source and target directories, + * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry * dentry, struct dentry * target) { @@ -2202,7 +2222,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target) * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. + * dcache entries should not be moved in this way. See the locking + * requirements for __d_move. */ void d_move(struct dentry *dentry, struct dentry *target) { @@ -2320,7 +2341,8 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) * @inode: inode to bind to the dentry, to which aliases may be attached * * Introduces an dentry into the tree, substituting an extant disconnected - * root directory alias in its place if there is one + * root directory alias in its place if there is one. Caller must hold the + * i_mutex of the parent directory. */ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) { @@ -2362,8 +2384,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) actual = __d_unalias(inode, dentry, alias); } write_sequnlock(&rename_lock); - if (IS_ERR(actual)) + if (IS_ERR(actual)) { + if (PTR_ERR(actual) == -ELOOP) + pr_warn_ratelimited( + "VFS: Lookup of '%s' in %s %s" + " would have caused loop\n", + dentry->d_name.name, + inode->i_sb->s_type->name, + inode->i_sb->s_id); dput(alias); + } goto out_nolock; } } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e7a7a2f07324..f3a257d7a985 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -1,5 +1,5 @@ /* - * file.c - part of debugfs, a tiny little debug file system + * inode.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 2f27e578d466..d5d5297efe97 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -307,7 +307,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); s->s_root = d_alloc_root(inode); if (s->s_root) @@ -549,7 +549,7 @@ void devpts_pty_kill(struct tty_struct *tty) dentry = d_find_alias(inode); - inode->i_nlink--; + drop_nlink(inode); d_delete(dentry); dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ diff --git a/fs/direct-io.c b/fs/direct-io.c index 01d2d9ef609c..d740ab67ff6e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -35,11 +35,11 @@ #include <linux/buffer_head.h> #include <linux/rwsem.h> #include <linux/uio.h> -#include <asm/atomic.h> +#include <linux/atomic.h> /* * How many user pages to map in one call to get_user_pages(). This determines - * the size of a structure on the stack. + * the size of a structure in the slab cache */ #define DIO_PAGES 64 @@ -55,13 +55,10 @@ * blocksize. */ -struct dio { - /* BIO submission state */ +/* dio_state only used in the submission path */ + +struct dio_submit { struct bio *bio; /* bio under assembly */ - struct inode *inode; - int rw; - loff_t i_size; /* i_size when submitted */ - int flags; /* doesn't change */ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -76,18 +73,17 @@ struct dio { sector_t block_in_file; /* Current offset into the underlying file in dio_block units. */ unsigned blocks_available; /* At block_in_file. changes */ + int reap_counter; /* rate limit reaping */ sector_t final_block_in_request;/* doesn't change */ unsigned first_block_in_page; /* doesn't change, Used only once */ int boundary; /* prev block is at a boundary */ - int reap_counter; /* rate limit reaping */ get_block_t *get_block; /* block mapping function */ - dio_iodone_t *end_io; /* IO completion function */ dio_submit_t *submit_io; /* IO submition function */ + loff_t logical_offset_in_bio; /* current first logical block in bio */ sector_t final_block_in_bio; /* current final block in bio + 1 */ sector_t next_block_for_io; /* next block to be put under IO, in dio_blocks units */ - struct buffer_head map_bh; /* last get_block() result */ /* * Deferred addition of a page to the dio. These variables are @@ -100,18 +96,6 @@ struct dio { sector_t cur_page_block; /* Where it starts */ loff_t cur_page_fs_offset; /* Offset in file */ - /* BIO completion state */ - spinlock_t bio_lock; /* protects BIO fields below */ - unsigned long refcount; /* direct_io_worker() and bios */ - struct bio *bio_list; /* singly linked via bi_private */ - struct task_struct *waiter; /* waiting task (NULL if none) */ - - /* AIO related stuff */ - struct kiocb *iocb; /* kiocb */ - int is_async; /* is IO async ? */ - int io_error; /* IO error in completion path */ - ssize_t result; /* IO result */ - /* * Page fetching state. These variables belong to dio_refill_pages(). */ @@ -125,7 +109,30 @@ struct dio { */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ +}; + +/* dio_state communicated between submission path and end_io */ +struct dio { + int flags; /* doesn't change */ + int rw; + struct inode *inode; + loff_t i_size; /* i_size when submitted */ + dio_iodone_t *end_io; /* IO completion function */ + + void *private; /* copy from map_bh.b_private */ + + /* BIO completion state */ + spinlock_t bio_lock; /* protects BIO fields below */ int page_errors; /* errno from get_user_pages() */ + int is_async; /* is IO async ? */ + int io_error; /* IO error in completion path */ + unsigned long refcount; /* direct_io_worker() and bios */ + struct bio *bio_list; /* singly linked via bi_private */ + struct task_struct *waiter; /* waiting task (NULL if none) */ + + /* AIO related stuff */ + struct kiocb *iocb; /* kiocb */ + ssize_t result; /* IO result */ /* * pages[] (and any fields placed after it) are not zeroed out at @@ -133,7 +140,9 @@ struct dio { * wish that they not be zeroed. */ struct page *pages[DIO_PAGES]; /* page buffer */ -}; +} ____cacheline_aligned_in_smp; + +static struct kmem_cache *dio_cache __read_mostly; static void __inode_dio_wait(struct inode *inode) { @@ -182,27 +191,27 @@ EXPORT_SYMBOL_GPL(inode_dio_done); /* * How many pages are in the queue? */ -static inline unsigned dio_pages_present(struct dio *dio) +static inline unsigned dio_pages_present(struct dio_submit *sdio) { - return dio->tail - dio->head; + return sdio->tail - sdio->head; } /* * Go grab and pin some userspace pages. Typically we'll get 64 at a time. */ -static int dio_refill_pages(struct dio *dio) +static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { int ret; int nr_pages; - nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); + nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); ret = get_user_pages_fast( - dio->curr_user_address, /* Where from? */ + sdio->curr_user_address, /* Where from? */ nr_pages, /* How many pages? */ dio->rw == READ, /* Write to memory? */ &dio->pages[0]); /* Put results here */ - if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { + if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { struct page *page = ZERO_PAGE(0); /* * A memory fault, but the filesystem has some outstanding @@ -213,17 +222,17 @@ static int dio_refill_pages(struct dio *dio) dio->page_errors = ret; page_cache_get(page); dio->pages[0] = page; - dio->head = 0; - dio->tail = 1; + sdio->head = 0; + sdio->tail = 1; ret = 0; goto out; } if (ret >= 0) { - dio->curr_user_address += ret * PAGE_SIZE; - dio->curr_page += ret; - dio->head = 0; - dio->tail = ret; + sdio->curr_user_address += ret * PAGE_SIZE; + sdio->curr_page += ret; + sdio->head = 0; + sdio->tail = ret; ret = 0; } out: @@ -236,17 +245,18 @@ out: * decent number of pages, less frequently. To provide nicer use of the * L1 cache. */ -static struct page *dio_get_page(struct dio *dio) +static inline struct page *dio_get_page(struct dio *dio, + struct dio_submit *sdio) { - if (dio_pages_present(dio) == 0) { + if (dio_pages_present(sdio) == 0) { int ret; - ret = dio_refill_pages(dio); + ret = dio_refill_pages(dio, sdio); if (ret) return ERR_PTR(ret); - BUG_ON(dio_pages_present(dio) == 0); + BUG_ON(dio_pages_present(sdio) == 0); } - return dio->pages[dio->head++]; + return dio->pages[sdio->head++]; } /** @@ -292,7 +302,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is if (dio->end_io && dio->result) { dio->end_io(dio->iocb, offset, transferred, - dio->map_bh.b_private, ret, is_async); + dio->private, ret, is_async); } else { if (is_async) aio_complete(dio->iocb, ret, 0); @@ -323,7 +333,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) if (remaining == 0) { dio_complete(dio, dio->iocb->ki_pos, 0, true); - kfree(dio); + kmem_cache_free(dio_cache, dio); } } @@ -367,9 +377,10 @@ void dio_end_io(struct bio *bio, int error) } EXPORT_SYMBOL_GPL(dio_end_io); -static void -dio_bio_alloc(struct dio *dio, struct block_device *bdev, - sector_t first_sector, int nr_vecs) +static inline void +dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, + struct block_device *bdev, + sector_t first_sector, int nr_vecs) { struct bio *bio; @@ -386,8 +397,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, else bio->bi_end_io = dio_bio_end_io; - dio->bio = bio; - dio->logical_offset_in_bio = dio->cur_page_fs_offset; + sdio->bio = bio; + sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } /* @@ -397,9 +408,9 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, * * bios hold a dio reference between submit_bio and ->end_io. */ -static void dio_bio_submit(struct dio *dio) +static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) { - struct bio *bio = dio->bio; + struct bio *bio = sdio->bio; unsigned long flags; bio->bi_private = dio; @@ -411,24 +422,24 @@ static void dio_bio_submit(struct dio *dio) if (dio->is_async && dio->rw == READ) bio_set_pages_dirty(bio); - if (dio->submit_io) - dio->submit_io(dio->rw, bio, dio->inode, - dio->logical_offset_in_bio); + if (sdio->submit_io) + sdio->submit_io(dio->rw, bio, dio->inode, + sdio->logical_offset_in_bio); else submit_bio(dio->rw, bio); - dio->bio = NULL; - dio->boundary = 0; - dio->logical_offset_in_bio = 0; + sdio->bio = NULL; + sdio->boundary = 0; + sdio->logical_offset_in_bio = 0; } /* * Release any resources in case of a failure */ -static void dio_cleanup(struct dio *dio) +static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { - while (dio_pages_present(dio)) - page_cache_release(dio_get_page(dio)); + while (dio_pages_present(sdio)) + page_cache_release(dio_get_page(dio, sdio)); } /* @@ -518,11 +529,11 @@ static void dio_await_completion(struct dio *dio) * * This also helps to limit the peak amount of pinned userspace memory. */ -static int dio_bio_reap(struct dio *dio) +static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) { int ret = 0; - if (dio->reap_counter++ >= 64) { + if (sdio->reap_counter++ >= 64) { while (dio->bio_list) { unsigned long flags; struct bio *bio; @@ -536,14 +547,14 @@ static int dio_bio_reap(struct dio *dio) if (ret == 0) ret = ret2; } - dio->reap_counter = 0; + sdio->reap_counter = 0; } return ret; } /* * Call into the fs to map some more disk blocks. We record the current number - * of available blocks at dio->blocks_available. These are in units of the + * of available blocks at sdio->blocks_available. These are in units of the * fs blocksize, (1 << inode->i_blkbits). * * The fs is allowed to map lots of blocks at once. If it wants to do that, @@ -564,10 +575,10 @@ static int dio_bio_reap(struct dio *dio) * buffer_mapped(). However the direct-io code will only process holes one * block at a time - it will repeatedly call get_block() as it walks the hole. */ -static int get_more_blocks(struct dio *dio) +static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { int ret; - struct buffer_head *map_bh = &dio->map_bh; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ @@ -580,11 +591,11 @@ static int get_more_blocks(struct dio *dio) */ ret = dio->page_errors; if (ret == 0) { - BUG_ON(dio->block_in_file >= dio->final_block_in_request); - fs_startblk = dio->block_in_file >> dio->blkfactor; - dio_count = dio->final_block_in_request - dio->block_in_file; - fs_count = dio_count >> dio->blkfactor; - blkmask = (1 << dio->blkfactor) - 1; + BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); + fs_startblk = sdio->block_in_file >> sdio->blkfactor; + dio_count = sdio->final_block_in_request - sdio->block_in_file; + fs_count = dio_count >> sdio->blkfactor; + blkmask = (1 << sdio->blkfactor) - 1; if (dio_count & blkmask) fs_count++; @@ -604,13 +615,16 @@ static int get_more_blocks(struct dio *dio) */ create = dio->rw & WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (dio->block_in_file < (i_size_read(dio->inode) >> - dio->blkbits)) + if (sdio->block_in_file < (i_size_read(dio->inode) >> + sdio->blkbits)) create = 0; } - ret = (*dio->get_block)(dio->inode, fs_startblk, + ret = (*sdio->get_block)(dio->inode, fs_startblk, map_bh, create); + + /* Store for completion */ + dio->private = map_bh->b_private; } return ret; } @@ -618,20 +632,21 @@ static int get_more_blocks(struct dio *dio) /* * There is no bio. Make one now. */ -static int dio_new_bio(struct dio *dio, sector_t start_sector) +static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, + sector_t start_sector, struct buffer_head *map_bh) { sector_t sector; int ret, nr_pages; - ret = dio_bio_reap(dio); + ret = dio_bio_reap(dio, sdio); if (ret) goto out; - sector = start_sector << (dio->blkbits - 9); - nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); + sector = start_sector << (sdio->blkbits - 9); + nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); nr_pages = min(nr_pages, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); - dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); - dio->boundary = 0; + dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); + sdio->boundary = 0; out: return ret; } @@ -643,21 +658,21 @@ out: * * Return zero on success. Non-zero means the caller needs to start a new BIO. */ -static int dio_bio_add_page(struct dio *dio) +static inline int dio_bio_add_page(struct dio_submit *sdio) { int ret; - ret = bio_add_page(dio->bio, dio->cur_page, - dio->cur_page_len, dio->cur_page_offset); - if (ret == dio->cur_page_len) { + ret = bio_add_page(sdio->bio, sdio->cur_page, + sdio->cur_page_len, sdio->cur_page_offset); + if (ret == sdio->cur_page_len) { /* * Decrement count only, if we are done with this page */ - if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE) - dio->pages_in_io--; - page_cache_get(dio->cur_page); - dio->final_block_in_bio = dio->cur_page_block + - (dio->cur_page_len >> dio->blkbits); + if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) + sdio->pages_in_io--; + page_cache_get(sdio->cur_page); + sdio->final_block_in_bio = sdio->cur_page_block + + (sdio->cur_page_len >> sdio->blkbits); ret = 0; } else { ret = 1; @@ -675,14 +690,15 @@ static int dio_bio_add_page(struct dio *dio) * The caller of this function is responsible for removing cur_page from the * dio, and for dropping the refcount which came from that presence. */ -static int dio_send_cur_page(struct dio *dio) +static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { int ret = 0; - if (dio->bio) { - loff_t cur_offset = dio->cur_page_fs_offset; - loff_t bio_next_offset = dio->logical_offset_in_bio + - dio->bio->bi_size; + if (sdio->bio) { + loff_t cur_offset = sdio->cur_page_fs_offset; + loff_t bio_next_offset = sdio->logical_offset_in_bio + + sdio->bio->bi_size; /* * See whether this new request is contiguous with the old. @@ -698,28 +714,28 @@ static int dio_send_cur_page(struct dio *dio) * be the next logical offset in the bio, submit the bio we * have. */ - if (dio->final_block_in_bio != dio->cur_page_block || + if (sdio->final_block_in_bio != sdio->cur_page_block || cur_offset != bio_next_offset) - dio_bio_submit(dio); + dio_bio_submit(dio, sdio); /* * Submit now if the underlying fs is about to perform a * metadata read */ - else if (dio->boundary) - dio_bio_submit(dio); + else if (sdio->boundary) + dio_bio_submit(dio, sdio); } - if (dio->bio == NULL) { - ret = dio_new_bio(dio, dio->cur_page_block); + if (sdio->bio == NULL) { + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret) goto out; } - if (dio_bio_add_page(dio) != 0) { - dio_bio_submit(dio); - ret = dio_new_bio(dio, dio->cur_page_block); + if (dio_bio_add_page(sdio) != 0) { + dio_bio_submit(dio, sdio); + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret == 0) { - ret = dio_bio_add_page(dio); + ret = dio_bio_add_page(sdio); BUG_ON(ret != 0); } } @@ -744,9 +760,10 @@ out: * If that doesn't work out then we put the old page into the bio and add this * page to the dio instead. */ -static int -submit_page_section(struct dio *dio, struct page *page, - unsigned offset, unsigned len, sector_t blocknr) +static inline int +submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, + unsigned offset, unsigned len, sector_t blocknr, + struct buffer_head *map_bh) { int ret = 0; @@ -760,20 +777,20 @@ submit_page_section(struct dio *dio, struct page *page, /* * Can we just grow the current page's presence in the dio? */ - if ( (dio->cur_page == page) && - (dio->cur_page_offset + dio->cur_page_len == offset) && - (dio->cur_page_block + - (dio->cur_page_len >> dio->blkbits) == blocknr)) { - dio->cur_page_len += len; + if (sdio->cur_page == page && + sdio->cur_page_offset + sdio->cur_page_len == offset && + sdio->cur_page_block + + (sdio->cur_page_len >> sdio->blkbits) == blocknr) { + sdio->cur_page_len += len; /* - * If dio->boundary then we want to schedule the IO now to + * If sdio->boundary then we want to schedule the IO now to * avoid metadata seeks. */ - if (dio->boundary) { - ret = dio_send_cur_page(dio); - page_cache_release(dio->cur_page); - dio->cur_page = NULL; + if (sdio->boundary) { + ret = dio_send_cur_page(dio, sdio, map_bh); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; } goto out; } @@ -781,20 +798,20 @@ submit_page_section(struct dio *dio, struct page *page, /* * If there's a deferred page already there then send it. */ - if (dio->cur_page) { - ret = dio_send_cur_page(dio); - page_cache_release(dio->cur_page); - dio->cur_page = NULL; + if (sdio->cur_page) { + ret = dio_send_cur_page(dio, sdio, map_bh); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; if (ret) goto out; } page_cache_get(page); /* It is in dio */ - dio->cur_page = page; - dio->cur_page_offset = offset; - dio->cur_page_len = len; - dio->cur_page_block = blocknr; - dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits; + sdio->cur_page = page; + sdio->cur_page_offset = offset; + sdio->cur_page_len = len; + sdio->cur_page_block = blocknr; + sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; out: return ret; } @@ -804,16 +821,16 @@ out: * file blocks. Only called for S_ISREG files - blockdevs do not set * buffer_new */ -static void clean_blockdev_aliases(struct dio *dio) +static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) { unsigned i; unsigned nblocks; - nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits; + nblocks = map_bh->b_size >> dio->inode->i_blkbits; for (i = 0; i < nblocks; i++) { - unmap_underlying_metadata(dio->map_bh.b_bdev, - dio->map_bh.b_blocknr + i); + unmap_underlying_metadata(map_bh->b_bdev, + map_bh->b_blocknr + i); } } @@ -826,19 +843,20 @@ static void clean_blockdev_aliases(struct dio *dio) * `end' is zero if we're doing the start of the IO, 1 at the end of the * IO. */ -static void dio_zero_block(struct dio *dio, int end) +static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, + int end, struct buffer_head *map_bh) { unsigned dio_blocks_per_fs_block; unsigned this_chunk_blocks; /* In dio_blocks */ unsigned this_chunk_bytes; struct page *page; - dio->start_zero_done = 1; - if (!dio->blkfactor || !buffer_new(&dio->map_bh)) + sdio->start_zero_done = 1; + if (!sdio->blkfactor || !buffer_new(map_bh)) return; - dio_blocks_per_fs_block = 1 << dio->blkfactor; - this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1); + dio_blocks_per_fs_block = 1 << sdio->blkfactor; + this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1); if (!this_chunk_blocks) return; @@ -850,14 +868,14 @@ static void dio_zero_block(struct dio *dio, int end) if (end) this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; - this_chunk_bytes = this_chunk_blocks << dio->blkbits; + this_chunk_bytes = this_chunk_blocks << sdio->blkbits; page = ZERO_PAGE(0); - if (submit_page_section(dio, page, 0, this_chunk_bytes, - dio->next_block_for_io)) + if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, + sdio->next_block_for_io, map_bh)) return; - dio->next_block_for_io += this_chunk_blocks; + sdio->next_block_for_io += this_chunk_blocks; } /* @@ -876,20 +894,20 @@ static void dio_zero_block(struct dio *dio, int end) * it should set b_size to PAGE_SIZE or more inside get_block(). This gives * fine alignment but still allows this function to work in PAGE_SIZE units. */ -static int do_direct_IO(struct dio *dio) +static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { - const unsigned blkbits = dio->blkbits; + const unsigned blkbits = sdio->blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; struct page *page; unsigned block_in_page; - struct buffer_head *map_bh = &dio->map_bh; int ret = 0; /* The I/O can start at any block offset within the first page */ - block_in_page = dio->first_block_in_page; + block_in_page = sdio->first_block_in_page; - while (dio->block_in_file < dio->final_block_in_request) { - page = dio_get_page(dio); + while (sdio->block_in_file < sdio->final_block_in_request) { + page = dio_get_page(dio, sdio); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; @@ -901,14 +919,14 @@ static int do_direct_IO(struct dio *dio) unsigned this_chunk_blocks; /* # of blocks */ unsigned u; - if (dio->blocks_available == 0) { + if (sdio->blocks_available == 0) { /* * Need to go and map some more disk */ unsigned long blkmask; unsigned long dio_remainder; - ret = get_more_blocks(dio); + ret = get_more_blocks(dio, sdio, map_bh); if (ret) { page_cache_release(page); goto out; @@ -916,18 +934,18 @@ static int do_direct_IO(struct dio *dio) if (!buffer_mapped(map_bh)) goto do_holes; - dio->blocks_available = - map_bh->b_size >> dio->blkbits; - dio->next_block_for_io = - map_bh->b_blocknr << dio->blkfactor; + sdio->blocks_available = + map_bh->b_size >> sdio->blkbits; + sdio->next_block_for_io = + map_bh->b_blocknr << sdio->blkfactor; if (buffer_new(map_bh)) - clean_blockdev_aliases(dio); + clean_blockdev_aliases(dio, map_bh); - if (!dio->blkfactor) + if (!sdio->blkfactor) goto do_holes; - blkmask = (1 << dio->blkfactor) - 1; - dio_remainder = (dio->block_in_file & blkmask); + blkmask = (1 << sdio->blkfactor) - 1; + dio_remainder = (sdio->block_in_file & blkmask); /* * If we are at the start of IO and that IO @@ -941,8 +959,8 @@ static int do_direct_IO(struct dio *dio) * on-disk */ if (!buffer_new(map_bh)) - dio->next_block_for_io += dio_remainder; - dio->blocks_available -= dio_remainder; + sdio->next_block_for_io += dio_remainder; + sdio->blocks_available -= dio_remainder; } do_holes: /* Handle holes */ @@ -961,7 +979,7 @@ do_holes: */ i_size_aligned = ALIGN(i_size_read(dio->inode), 1 << blkbits); - if (dio->block_in_file >= + if (sdio->block_in_file >= i_size_aligned >> blkbits) { /* We hit eof */ page_cache_release(page); @@ -969,7 +987,7 @@ do_holes: } zero_user(page, block_in_page << blkbits, 1 << blkbits); - dio->block_in_file++; + sdio->block_in_file++; block_in_page++; goto next_block; } @@ -979,38 +997,41 @@ do_holes: * is finer than the underlying fs, go check to see if * we must zero out the start of this block. */ - if (unlikely(dio->blkfactor && !dio->start_zero_done)) - dio_zero_block(dio, 0); + if (unlikely(sdio->blkfactor && !sdio->start_zero_done)) + dio_zero_block(dio, sdio, 0, map_bh); /* * Work out, in this_chunk_blocks, how much disk we * can add to this page */ - this_chunk_blocks = dio->blocks_available; + this_chunk_blocks = sdio->blocks_available; u = (PAGE_SIZE - offset_in_page) >> blkbits; if (this_chunk_blocks > u) this_chunk_blocks = u; - u = dio->final_block_in_request - dio->block_in_file; + u = sdio->final_block_in_request - sdio->block_in_file; if (this_chunk_blocks > u) this_chunk_blocks = u; this_chunk_bytes = this_chunk_blocks << blkbits; BUG_ON(this_chunk_bytes == 0); - dio->boundary = buffer_boundary(map_bh); - ret = submit_page_section(dio, page, offset_in_page, - this_chunk_bytes, dio->next_block_for_io); + sdio->boundary = buffer_boundary(map_bh); + ret = submit_page_section(dio, sdio, page, + offset_in_page, + this_chunk_bytes, + sdio->next_block_for_io, + map_bh); if (ret) { page_cache_release(page); goto out; } - dio->next_block_for_io += this_chunk_blocks; + sdio->next_block_for_io += this_chunk_blocks; - dio->block_in_file += this_chunk_blocks; + sdio->block_in_file += this_chunk_blocks; block_in_page += this_chunk_blocks; - dio->blocks_available -= this_chunk_blocks; + sdio->blocks_available -= this_chunk_blocks; next_block: - BUG_ON(dio->block_in_file > dio->final_block_in_request); - if (dio->block_in_file == dio->final_block_in_request) + BUG_ON(sdio->block_in_file > sdio->final_block_in_request); + if (sdio->block_in_file == sdio->final_block_in_request) break; } @@ -1022,135 +1043,10 @@ out: return ret; } -static ssize_t -direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, - const struct iovec *iov, loff_t offset, unsigned long nr_segs, - unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, struct dio *dio) +static inline int drop_refcount(struct dio *dio) { - unsigned long user_addr; + int ret2; unsigned long flags; - int seg; - ssize_t ret = 0; - ssize_t ret2; - size_t bytes; - - dio->inode = inode; - dio->rw = rw; - dio->blkbits = blkbits; - dio->blkfactor = inode->i_blkbits - blkbits; - dio->block_in_file = offset >> blkbits; - - dio->get_block = get_block; - dio->end_io = end_io; - dio->submit_io = submit_io; - dio->final_block_in_bio = -1; - dio->next_block_for_io = -1; - - dio->iocb = iocb; - dio->i_size = i_size_read(inode); - - spin_lock_init(&dio->bio_lock); - dio->refcount = 1; - - /* - * In case of non-aligned buffers, we may need 2 more - * pages since we need to zero out first and last block. - */ - if (unlikely(dio->blkfactor)) - dio->pages_in_io = 2; - - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - dio->pages_in_io += - ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE - - user_addr/PAGE_SIZE); - } - - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - dio->size += bytes = iov[seg].iov_len; - - /* Index into the first page of the first block */ - dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; - dio->final_block_in_request = dio->block_in_file + - (bytes >> blkbits); - /* Page fetching state */ - dio->head = 0; - dio->tail = 0; - dio->curr_page = 0; - - dio->total_pages = 0; - if (user_addr & (PAGE_SIZE-1)) { - dio->total_pages++; - bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); - } - dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; - dio->curr_user_address = user_addr; - - ret = do_direct_IO(dio); - - dio->result += iov[seg].iov_len - - ((dio->final_block_in_request - dio->block_in_file) << - blkbits); - - if (ret) { - dio_cleanup(dio); - break; - } - } /* end iovec loop */ - - if (ret == -ENOTBLK) { - /* - * The remaining part of the request will be - * be handled by buffered I/O when we return - */ - ret = 0; - } - /* - * There may be some unwritten disk at the end of a part-written - * fs-block-sized block. Go zero that now. - */ - dio_zero_block(dio, 1); - - if (dio->cur_page) { - ret2 = dio_send_cur_page(dio); - if (ret == 0) - ret = ret2; - page_cache_release(dio->cur_page); - dio->cur_page = NULL; - } - if (dio->bio) - dio_bio_submit(dio); - - /* - * It is possible that, we return short IO due to end of file. - * In that case, we need to release all the pages we got hold on. - */ - dio_cleanup(dio); - - /* - * All block lookups have been performed. For READ requests - * we can let i_mutex go now that its achieved its purpose - * of protecting us from looking up uninitialized blocks. - */ - if (rw == READ && (dio->flags & DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); - - /* - * The only time we want to leave bios in flight is when a successful - * partial aio read or full aio write have been setup. In that case - * bio completion will call aio_complete. The only time it's safe to - * call aio_complete is when we return -EIOCBQUEUED, so we key on that. - * This had *better* be the only place that raises -EIOCBQUEUED. - */ - BUG_ON(ret == -EIOCBQUEUED); - if (dio->is_async && ret == 0 && dio->result && - ((rw & READ) || (dio->result == dio->size))) - ret = -EIOCBQUEUED; - - if (ret != -EIOCBQUEUED) - dio_await_completion(dio); /* * Sync will always be dropping the final ref and completing the @@ -1166,14 +1062,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_lock_irqsave(&dio->bio_lock, flags); ret2 = --dio->refcount; spin_unlock_irqrestore(&dio->bio_lock, flags); - - if (ret2 == 0) { - ret = dio_complete(dio, offset, ret, false); - kfree(dio); - } else - BUG_ON(ret != -EIOCBQUEUED); - - return ret; + return ret2; } /* @@ -1195,6 +1084,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * expected that filesystem provide exclusion between new direct I/O * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, * but other filesystems need to take care of this on their own. + * + * NOTE: if you pass "sdio" to anything by pointer make sure that function + * is always inlined. Otherwise gcc is unable to split the structure into + * individual fields and will generate much worse code. This is important + * for the whole file. */ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, @@ -1211,6 +1105,10 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; + struct dio_submit sdio = { 0, }; + unsigned long user_addr; + size_t bytes; + struct buffer_head map_bh = { 0, }; if (rw & WRITE) rw = WRITE_ODIRECT; @@ -1244,7 +1142,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (rw == READ && end == offset) return 0; - dio = kmalloc(sizeof(*dio), GFP_KERNEL); + dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); retval = -ENOMEM; if (!dio) goto out; @@ -1268,7 +1166,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, end - 1); if (retval) { mutex_unlock(&inode->i_mutex); - kfree(dio); + kmem_cache_free(dio_cache, dio); goto out; } } @@ -1288,11 +1186,141 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && (end > i_size_read(inode))); - retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_block, end_io, - submit_io, dio); + retval = 0; + + dio->inode = inode; + dio->rw = rw; + sdio.blkbits = blkbits; + sdio.blkfactor = inode->i_blkbits - blkbits; + sdio.block_in_file = offset >> blkbits; + + sdio.get_block = get_block; + dio->end_io = end_io; + sdio.submit_io = submit_io; + sdio.final_block_in_bio = -1; + sdio.next_block_for_io = -1; + + dio->iocb = iocb; + dio->i_size = i_size_read(inode); + + spin_lock_init(&dio->bio_lock); + dio->refcount = 1; + + /* + * In case of non-aligned buffers, we may need 2 more + * pages since we need to zero out first and last block. + */ + if (unlikely(sdio.blkfactor)) + sdio.pages_in_io = 2; + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + sdio.pages_in_io += + ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / + PAGE_SIZE - user_addr / PAGE_SIZE); + } + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + sdio.size += bytes = iov[seg].iov_len; + + /* Index into the first page of the first block */ + sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; + sdio.final_block_in_request = sdio.block_in_file + + (bytes >> blkbits); + /* Page fetching state */ + sdio.head = 0; + sdio.tail = 0; + sdio.curr_page = 0; + + sdio.total_pages = 0; + if (user_addr & (PAGE_SIZE-1)) { + sdio.total_pages++; + bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); + } + sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + sdio.curr_user_address = user_addr; + + retval = do_direct_IO(dio, &sdio, &map_bh); + + dio->result += iov[seg].iov_len - + ((sdio.final_block_in_request - sdio.block_in_file) << + blkbits); + + if (retval) { + dio_cleanup(dio, &sdio); + break; + } + } /* end iovec loop */ + + if (retval == -ENOTBLK) { + /* + * The remaining part of the request will be + * be handled by buffered I/O when we return + */ + retval = 0; + } + /* + * There may be some unwritten disk at the end of a part-written + * fs-block-sized block. Go zero that now. + */ + dio_zero_block(dio, &sdio, 1, &map_bh); + + if (sdio.cur_page) { + ssize_t ret2; + + ret2 = dio_send_cur_page(dio, &sdio, &map_bh); + if (retval == 0) + retval = ret2; + page_cache_release(sdio.cur_page); + sdio.cur_page = NULL; + } + if (sdio.bio) + dio_bio_submit(dio, &sdio); + + /* + * It is possible that, we return short IO due to end of file. + * In that case, we need to release all the pages we got hold on. + */ + dio_cleanup(dio, &sdio); + + /* + * All block lookups have been performed. For READ requests + * we can let i_mutex go now that its achieved its purpose + * of protecting us from looking up uninitialized blocks. + */ + if (rw == READ && (dio->flags & DIO_LOCKING)) + mutex_unlock(&dio->inode->i_mutex); + + /* + * The only time we want to leave bios in flight is when a successful + * partial aio read or full aio write have been setup. In that case + * bio completion will call aio_complete. The only time it's safe to + * call aio_complete is when we return -EIOCBQUEUED, so we key on that. + * This had *better* be the only place that raises -EIOCBQUEUED. + */ + BUG_ON(retval == -EIOCBQUEUED); + if (dio->is_async && retval == 0 && dio->result && + ((rw & READ) || (dio->result == sdio.size))) + retval = -EIOCBQUEUED; + + if (retval != -EIOCBQUEUED) + dio_await_completion(dio); + + if (drop_refcount(dio) == 0) { + retval = dio_complete(dio, offset, retval, false); + kmem_cache_free(dio_cache, dio); + } else + BUG_ON(retval != -EIOCBQUEUED); out: return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); + +static __init int dio_init(void) +{ + dio_cache = KMEM_CACHE(dio, SLAB_PANIC); + return 0; +} +module_init(dio_init) diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index e2b878004364..01fd5c11a7fb 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -92,7 +92,7 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number, op->info.number = number; op->info.start = 0; op->info.end = OFFSET_MAX; - if (fl->fl_lmops && fl->fl_lmops->fl_grant) + if (fl->fl_lmops && fl->fl_lmops->lm_grant) op->info.owner = (__u64) fl->fl_pid; else op->info.owner = (__u64)(long) fl->fl_owner; @@ -128,11 +128,11 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - if (fl->fl_lmops && fl->fl_lmops->fl_grant) { + if (fl->fl_lmops && fl->fl_lmops->lm_grant) { /* fl_owner is lockd which doesn't distinguish processes on the nfs client */ op->info.owner = (__u64) fl->fl_pid; - xop->callback = fl->fl_lmops->fl_grant; + xop->callback = fl->fl_lmops->lm_grant; locks_init_lock(&xop->flc); locks_copy_lock(&xop->flc, fl); xop->fl = fl; @@ -268,7 +268,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - if (fl->fl_lmops && fl->fl_lmops->fl_grant) + if (fl->fl_lmops && fl->fl_lmops->lm_grant) op->info.owner = (__u64) fl->fl_pid; else op->info.owner = (__u64)(long) fl->fl_owner; @@ -327,7 +327,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - if (fl->fl_lmops && fl->fl_lmops->fl_grant) + if (fl->fl_lmops && fl->fl_lmops->lm_grant) op->info.owner = (__u64) fl->fl_pid; else op->info.owner = (__u64)(long) fl->fl_owner; diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 1cd6d9d3e29a..cc16562654de 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,6 +1,6 @@ config ECRYPT_FS tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && CRYPTO + depends on EXPERIMENTAL && KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) select CRYPTO_ECB select CRYPTO_CBC select CRYPTO_MD5 diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 58609bde3b9f..2a834255c75d 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -967,7 +967,7 @@ static void ecryptfs_set_default_crypt_stat_vals( /** * ecryptfs_new_file_context - * @ecryptfs_dentry: The eCryptfs dentry + * @ecryptfs_inode: The eCryptfs inode * * If the crypto context for the file has not yet been established, * this is where we do that. Establishing a new crypto context @@ -984,13 +984,13 @@ static void ecryptfs_set_default_crypt_stat_vals( * * Returns zero on success; non-zero otherwise */ -int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) +int ecryptfs_new_file_context(struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; + ecryptfs_inode->i_sb)->mount_crypt_stat; int cipher_name_len; int rc = 0; @@ -1299,12 +1299,12 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max, } static int -ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, +ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode, char *virt, size_t virt_len) { int rc; - rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, + rc = ecryptfs_write_lower(ecryptfs_inode, virt, 0, virt_len); if (rc < 0) printk(KERN_ERR "%s: Error attempting to write header " @@ -1338,7 +1338,8 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, /** * ecryptfs_write_metadata - * @ecryptfs_dentry: The eCryptfs dentry + * @ecryptfs_dentry: The eCryptfs dentry, which should be negative + * @ecryptfs_inode: The newly created eCryptfs inode * * Write the file headers out. This will likely involve a userspace * callout, in which the session key is encrypted with one or more @@ -1348,10 +1349,11 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, * * Returns zero on success; non-zero on error */ -int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; unsigned int order; char *virt; size_t virt_len; @@ -1391,7 +1393,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, size); else - rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, + rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt, virt_len); if (rc) { printk(KERN_ERR "%s: Error writing metadata out to lower file; " @@ -1943,7 +1945,7 @@ static unsigned char *portable_filename_chars = ("-.0123456789ABCD" /* We could either offset on every reverse map or just pad some 0x00's * at the front here */ -static const unsigned char filename_rev_map[] = { +static const unsigned char filename_rev_map[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ @@ -1959,7 +1961,7 @@ static const unsigned char filename_rev_map[] = { 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ - 0x3D, 0x3E, 0x3F + 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */ }; /** diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 43c7c43b06f5..a9f29b12fbf2 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -29,6 +29,7 @@ #define ECRYPTFS_KERNEL_H #include <keys/user-type.h> +#include <keys/encrypted-type.h> #include <linux/fs.h> #include <linux/fs_stack.h> #include <linux/namei.h> @@ -36,125 +37,18 @@ #include <linux/hash.h> #include <linux/nsproxy.h> #include <linux/backing-dev.h> +#include <linux/ecryptfs.h> -/* Version verification for shared data structures w/ userspace */ -#define ECRYPTFS_VERSION_MAJOR 0x00 -#define ECRYPTFS_VERSION_MINOR 0x04 -#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x03 -/* These flags indicate which features are supported by the kernel - * module; userspace tools such as the mount helper read - * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine - * how to behave. */ -#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001 -#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002 -#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004 -#define ECRYPTFS_VERSIONING_POLICY 0x00000008 -#define ECRYPTFS_VERSIONING_XATTR 0x00000010 -#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 -#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 -#define ECRYPTFS_VERSIONING_HMAC 0x00000080 -#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100 -#define ECRYPTFS_VERSIONING_GCM 0x00000200 -#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ - | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ - | ECRYPTFS_VERSIONING_PUBKEY \ - | ECRYPTFS_VERSIONING_XATTR \ - | ECRYPTFS_VERSIONING_MULTKEY \ - | ECRYPTFS_VERSIONING_DEVMISC \ - | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) -#define ECRYPTFS_MAX_PASSWORD_LENGTH 64 -#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH -#define ECRYPTFS_SALT_SIZE 8 -#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2) -/* The original signature size is only for what is stored on disk; all - * in-memory representations are expanded hex, so it better adapted to - * be passed around or referenced on the command line */ -#define ECRYPTFS_SIG_SIZE 8 -#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2) -#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX -#define ECRYPTFS_MAX_KEY_BYTES 64 -#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512 #define ECRYPTFS_DEFAULT_IV_BYTES 16 -#define ECRYPTFS_FILE_VERSION 0x03 #define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 #define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192 #define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32 #define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ #define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3) -#define ECRYPTFS_MAX_PKI_NAME_BYTES 16 #define ECRYPTFS_DEFAULT_NUM_USERS 4 #define ECRYPTFS_MAX_NUM_USERS 32768 #define ECRYPTFS_XATTR_NAME "user.ecryptfs" -#define RFC2440_CIPHER_DES3_EDE 0x02 -#define RFC2440_CIPHER_CAST_5 0x03 -#define RFC2440_CIPHER_BLOWFISH 0x04 -#define RFC2440_CIPHER_AES_128 0x07 -#define RFC2440_CIPHER_AES_192 0x08 -#define RFC2440_CIPHER_AES_256 0x09 -#define RFC2440_CIPHER_TWOFISH 0x0a -#define RFC2440_CIPHER_CAST_6 0x0b - -#define RFC2440_CIPHER_RSA 0x01 - -/** - * For convenience, we may need to pass around the encrypted session - * key between kernel and userspace because the authentication token - * may not be extractable. For example, the TPM may not release the - * private key, instead requiring the encrypted data and returning the - * decrypted data. - */ -struct ecryptfs_session_key { -#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001 -#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002 -#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004 -#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008 - u32 flags; - u32 encrypted_key_size; - u32 decrypted_key_size; - u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES]; - u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES]; -}; - -struct ecryptfs_password { - u32 password_bytes; - s32 hash_algo; - u32 hash_iterations; - u32 session_key_encryption_key_bytes; -#define ECRYPTFS_PERSISTENT_PASSWORD 0x01 -#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02 - u32 flags; - /* Iterated-hash concatenation of salt and passphrase */ - u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES]; - u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; - /* Always in expanded hex */ - u8 salt[ECRYPTFS_SALT_SIZE]; -}; - -enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY}; - -struct ecryptfs_private_key { - u32 key_size; - u32 data_len; - u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; - char pki_type[ECRYPTFS_MAX_PKI_NAME_BYTES + 1]; - u8 data[]; -}; - -/* May be a password or a private key */ -struct ecryptfs_auth_tok { - u16 version; /* 8-bit major and 8-bit minor */ - u16 token_type; -#define ECRYPTFS_ENCRYPT_ONLY 0x00000001 - u32 flags; - struct ecryptfs_session_key session_key; - u8 reserved[32]; - union { - struct ecryptfs_password password; - struct ecryptfs_private_key private_key; - } token; -} __attribute__ ((packed)); - void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size); extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); @@ -185,11 +79,47 @@ struct ecryptfs_page_crypt_context { } param; }; +#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) +static inline struct ecryptfs_auth_tok * +ecryptfs_get_encrypted_key_payload_data(struct key *key) +{ + if (key->type == &key_type_encrypted) + return (struct ecryptfs_auth_tok *) + (&((struct encrypted_key_payload *)key->payload.data)->payload_data); + else + return NULL; +} + +static inline struct key *ecryptfs_get_encrypted_key(char *sig) +{ + return request_key(&key_type_encrypted, sig, NULL); +} + +#else +static inline struct ecryptfs_auth_tok * +ecryptfs_get_encrypted_key_payload_data(struct key *key) +{ + return NULL; +} + +static inline struct key *ecryptfs_get_encrypted_key(char *sig) +{ + return ERR_PTR(-ENOKEY); +} + +#endif /* CONFIG_ENCRYPTED_KEYS */ + static inline struct ecryptfs_auth_tok * ecryptfs_get_key_payload_data(struct key *key) { - return (struct ecryptfs_auth_tok *) - (((struct user_key_payload*)key->payload.data)->data); + struct ecryptfs_auth_tok *auth_tok; + + auth_tok = ecryptfs_get_encrypted_key_payload_data(key); + if (!auth_tok) + return (struct ecryptfs_auth_tok *) + (((struct user_key_payload *)key->payload.data)->data); + else + return auth_tok; } #define ECRYPTFS_MAX_KEYSET_SIZE 1024 @@ -584,7 +514,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) #define ecryptfs_printk(type, fmt, arg...) \ __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); -__attribute__ ((format(printf, 1, 2))) +__printf(1, 2) void __ecryptfs_printk(const char *fmt, ...); extern const struct file_operations ecryptfs_main_fops; @@ -654,9 +584,10 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); int ecryptfs_encrypt_page(struct page *page); int ecryptfs_decrypt_page(struct page *page); -int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode); int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); -int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); +int ecryptfs_new_file_context(struct inode *ecryptfs_inode); void ecryptfs_write_crypt_stat_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, size_t *written); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index c6ac98cf9baa..d3f95f941c47 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -139,6 +139,27 @@ out: return rc; } +static void ecryptfs_vma_close(struct vm_area_struct *vma) +{ + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +static const struct vm_operations_struct ecryptfs_file_vm_ops = { + .close = ecryptfs_vma_close, + .fault = filemap_fault, +}; + +static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc; + + rc = generic_file_mmap(file, vma); + if (!rc) + vma->vm_ops = &ecryptfs_file_vm_ops; + + return rc; +} + struct kmem_cache *ecryptfs_file_info_cache; /** @@ -349,7 +370,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_file_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 340c657a108c..32f90a3ae63e 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -69,6 +69,7 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque) inode->i_ino = lower_inode->i_ino; inode->i_version++; inode->i_mapping->a_ops = &ecryptfs_aops; + inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; if (S_ISLNK(inode->i_mode)) inode->i_op = &ecryptfs_symlink_iops; @@ -171,22 +172,23 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode, * it. It will also update the eCryptfs directory inode to mimic the * stat of the lower directory inode. * - * Returns zero on success; non-zero on error condition + * Returns the new eCryptfs inode on success; an ERR_PTR on error condition */ -static int +static struct inode * ecryptfs_do_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, int mode) { int rc; struct dentry *lower_dentry; struct dentry *lower_dir_dentry; + struct inode *inode; lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); if (IS_ERR(lower_dir_dentry)) { ecryptfs_printk(KERN_ERR, "Error locking directory of " "dentry\n"); - rc = PTR_ERR(lower_dir_dentry); + inode = ERR_CAST(lower_dir_dentry); goto out; } rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, @@ -194,20 +196,19 @@ ecryptfs_do_create(struct inode *directory_inode, if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); + inode = ERR_PTR(rc); goto out_lock; } - rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, - directory_inode->i_sb); - if (rc) { - ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); + inode = __ecryptfs_get_inode(lower_dentry->d_inode, + directory_inode->i_sb); + if (IS_ERR(inode)) goto out_lock; - } fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); out_lock: unlock_dir(lower_dir_dentry); out: - return rc; + return inode; } /** @@ -218,26 +219,26 @@ out: * * Returns zero on success */ -static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) +static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; int rc = 0; - if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { + if (S_ISDIR(ecryptfs_inode->i_mode)) { ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); goto out; } ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); - rc = ecryptfs_new_file_context(ecryptfs_dentry); + rc = ecryptfs_new_file_context(ecryptfs_inode); if (rc) { ecryptfs_printk(KERN_ERR, "Error creating new file " "context; rc = [%d]\n", rc); goto out; } - rc = ecryptfs_get_lower_file(ecryptfs_dentry, - ecryptfs_dentry->d_inode); + rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " @@ -245,10 +246,10 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) ecryptfs_dentry->d_name.name, rc); goto out; } - rc = ecryptfs_write_metadata(ecryptfs_dentry); + rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode); if (rc) printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); - ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); + ecryptfs_put_lower_file(ecryptfs_inode); out: return rc; } @@ -268,18 +269,28 @@ static int ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, int mode, struct nameidata *nd) { + struct inode *ecryptfs_inode; int rc; - /* ecryptfs_do_create() calls ecryptfs_interpose() */ - rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode); - if (unlikely(rc)) { + ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry, + mode); + if (unlikely(IS_ERR(ecryptfs_inode))) { ecryptfs_printk(KERN_WARNING, "Failed to create file in" "lower filesystem\n"); + rc = PTR_ERR(ecryptfs_inode); goto out; } /* At this point, a file exists on "disk"; we need to make sure * that this on disk file is prepared to be an ecryptfs file */ - rc = ecryptfs_initialize_file(ecryptfs_dentry); + rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode); + if (rc) { + drop_nlink(ecryptfs_inode); + unlock_new_inode(ecryptfs_inode); + iput(ecryptfs_inode); + goto out; + } + d_instantiate(ecryptfs_dentry, ecryptfs_inode); + unlock_new_inode(ecryptfs_inode); out: return rc; } @@ -473,8 +484,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, goto out_lock; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - old_dentry->d_inode->i_nlink = - ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; + set_nlink(old_dentry->d_inode, + ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink); i_size_write(new_dentry->d_inode, file_size_save); out_lock: unlock_dir(lower_dir_dentry); @@ -498,8 +509,8 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) goto out_unlock; } fsstack_copy_attr_times(dir, lower_dir_inode); - dentry->d_inode->i_nlink = - ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink; + set_nlink(dentry->d_inode, + ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink); dentry->d_inode->i_ctime = dir->i_ctime; d_drop(dentry); out_unlock: @@ -564,7 +575,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); out: unlock_dir(lower_dir_dentry); if (!dentry->d_inode) @@ -587,7 +598,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) if (!rc && dentry->d_inode) clear_nlink(dentry->d_inode); fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); unlock_dir(lower_dir_dentry); if (!rc) d_drop(dentry); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 27a7fefb83eb..ac1ad48c2376 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1635,11 +1635,14 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, (*auth_tok_key) = request_key(&key_type_user, sig, NULL); if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { - printk(KERN_ERR "Could not find key with description: [%s]\n", - sig); - rc = process_request_key_err(PTR_ERR(*auth_tok_key)); - (*auth_tok_key) = NULL; - goto out; + (*auth_tok_key) = ecryptfs_get_encrypted_key(sig); + if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { + printk(KERN_ERR "Could not find key with description: [%s]\n", + sig); + rc = process_request_key_err(PTR_ERR(*auth_tok_key)); + (*auth_tok_key) = NULL; + goto out; + } } down_write(&(*auth_tok_key)->sem); rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok); @@ -1868,11 +1871,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, * just one will be sufficient to decrypt to get the FEK. */ find_next_matching_auth_tok: found_auth_tok = 0; - if (auth_tok_key) { - up_write(&(auth_tok_key->sem)); - key_put(auth_tok_key); - auth_tok_key = NULL; - } list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { candidate_auth_tok = &auth_tok_list_item->auth_tok; if (unlikely(ecryptfs_verbosity > 0)) { @@ -1909,14 +1907,22 @@ found_matching_auth_tok: memcpy(&(candidate_auth_tok->token.private_key), &(matching_auth_tok->token.private_key), sizeof(struct ecryptfs_private_key)); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); rc = decrypt_pki_encrypted_session_key(candidate_auth_tok, crypt_stat); } else if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD) { memcpy(&(candidate_auth_tok->token.password), &(matching_auth_tok->token.password), sizeof(struct ecryptfs_password)); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); rc = decrypt_passphrase_encrypted_session_key( candidate_auth_tok, crypt_stat); + } else { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + rc = -EINVAL; } if (rc) { struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp; @@ -1956,21 +1962,18 @@ found_matching_auth_tok: out_wipe_list: wipe_auth_tok_list(&auth_tok_list); out: - if (auth_tok_key) { - up_write(&(auth_tok_key->sem)); - key_put(auth_tok_key); - } return rc; } static int -pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, +pki_encrypt_session_key(struct key *auth_tok_key, + struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_key_record *key_rec) { struct ecryptfs_msg_ctx *msg_ctx = NULL; char *payload = NULL; - size_t payload_len; + size_t payload_len = 0; struct ecryptfs_message *msg; int rc; @@ -1979,6 +1982,8 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, crypt_stat->cipher, crypt_stat->key_size), crypt_stat, &payload, &payload_len); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); goto out; @@ -2008,6 +2013,8 @@ out: * write_tag_1_packet - Write an RFC2440-compatible tag 1 (public key) packet * @dest: Buffer into which to write the packet * @remaining_bytes: Maximum number of bytes that can be writtn + * @auth_tok_key: The authentication token key to unlock and put when done with + * @auth_tok * @auth_tok: The authentication token used for generating the tag 1 packet * @crypt_stat: The cryptographic context * @key_rec: The key record struct for the tag 1 packet @@ -2018,7 +2025,7 @@ out: */ static int write_tag_1_packet(char *dest, size_t *remaining_bytes, - struct ecryptfs_auth_tok *auth_tok, + struct key *auth_tok_key, struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_key_record *key_rec, size_t *packet_size) { @@ -2039,12 +2046,15 @@ write_tag_1_packet(char *dest, size_t *remaining_bytes, memcpy(key_rec->enc_key, auth_tok->session_key.encrypted_key, auth_tok->session_key.encrypted_key_size); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); goto encrypted_session_key_set; } if (auth_tok->session_key.encrypted_key_size == 0) auth_tok->session_key.encrypted_key_size = auth_tok->token.private_key.key_size; - rc = pki_encrypt_session_key(auth_tok, crypt_stat, key_rec); + rc = pki_encrypt_session_key(auth_tok_key, auth_tok, crypt_stat, + key_rec); if (rc) { printk(KERN_ERR "Failed to encrypt session key via a key " "module; rc = [%d]\n", rc); @@ -2248,7 +2258,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, auth_tok->token.password.session_key_encryption_key, crypt_stat->key_size); ecryptfs_printk(KERN_DEBUG, - "Cached session key " "encryption key: \n"); + "Cached session key encryption key:\n"); if (ecryptfs_verbosity > 0) ecryptfs_dump_hex(session_key_encryption_key, 16); } @@ -2421,6 +2431,8 @@ ecryptfs_generate_key_packet_set(char *dest_base, &max, auth_tok, crypt_stat, key_rec, &written); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); if (rc) { ecryptfs_printk(KERN_WARNING, "Error " "writing tag 3 packet\n"); @@ -2438,8 +2450,8 @@ ecryptfs_generate_key_packet_set(char *dest_base, } (*len) += written; } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { - rc = write_tag_1_packet(dest_base + (*len), - &max, auth_tok, + rc = write_tag_1_packet(dest_base + (*len), &max, + auth_tok_key, auth_tok, crypt_stat, key_rec, &written); if (rc) { ecryptfs_printk(KERN_WARNING, "Error " @@ -2448,14 +2460,13 @@ ecryptfs_generate_key_packet_set(char *dest_base, } (*len) += written; } else { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); ecryptfs_printk(KERN_WARNING, "Unsupported " "authentication token type\n"); rc = -EINVAL; goto out_free; } - up_write(&(auth_tok_key->sem)); - key_put(auth_tok_key); - auth_tok_key = NULL; } if (likely(max > 0)) { dest_base[(*len)] = 0x00; @@ -2468,11 +2479,6 @@ out_free: out: if (rc) (*len) = 0; - if (auth_tok_key) { - up_write(&(auth_tok_key->sem)); - key_put(auth_tok_key); - } - mutex_unlock(&crypt_stat->keysig_list_mutex); return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 9f1bb747d77d..b4a6befb1216 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -175,6 +175,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, + ecryptfs_opt_check_dev_ruid, ecryptfs_opt_err }; static const match_table_t tokens = { @@ -191,6 +192,7 @@ static const match_table_t tokens = { {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, + {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"}, {ecryptfs_opt_err, NULL} }; @@ -236,6 +238,7 @@ static void ecryptfs_init_mount_crypt_stat( * ecryptfs_parse_options * @sb: The ecryptfs super block * @options: The options passed to the kernel + * @check_ruid: set to 1 if device uid should be checked against the ruid * * Parse mount options: * debug=N - ecryptfs_verbosity level for debug output @@ -251,7 +254,8 @@ static void ecryptfs_init_mount_crypt_stat( * * Returns zero on success; non-zero on error */ -static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) +static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, + uid_t *check_ruid) { char *p; int rc = 0; @@ -276,6 +280,8 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) char *cipher_key_bytes_src; char *fn_cipher_key_bytes_src; + *check_ruid = 0; + if (!options) { rc = -EINVAL; goto out; @@ -380,6 +386,9 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) mount_crypt_stat->flags |= ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; break; + case ecryptfs_opt_check_dev_ruid: + *check_ruid = 1; + break; case ecryptfs_opt_err: default: printk(KERN_WARNING @@ -475,6 +484,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags const char *err = "Getting sb failed"; struct inode *inode; struct path path; + uid_t check_ruid; int rc; sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); @@ -483,7 +493,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } - rc = ecryptfs_parse_options(sbi, raw_data); + rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); if (rc) { err = "Error parsing options"; goto out; @@ -521,6 +531,15 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags "known incompatibilities\n"); goto out_free; } + + if (check_ruid && path.dentry->d_inode->i_uid != current_uid()) { + rc = -EPERM; + printk(KERN_ERR "Mount of device (uid: %d) not owned by " + "requested user (uid: %d)\n", + path.dentry->d_inode->i_uid, current_uid()); + goto out_free; + } + ecryptfs_set_superblock_lower(s, path.dentry->d_sb); s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 85d430963116..3745f7c2b9c2 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -39,15 +39,16 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size) { - struct ecryptfs_inode_info *inode_info; + struct file *lower_file; mm_segment_t fs_save; ssize_t rc; - inode_info = ecryptfs_inode_to_private(ecryptfs_inode); - BUG_ON(!inode_info->lower_file); + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; fs_save = get_fs(); set_fs(get_ds()); - rc = vfs_write(inode_info->lower_file, data, size, &offset); + rc = vfs_write(lower_file, data, size, &offset); set_fs(fs_save); mark_inode_dirty_sync(ecryptfs_inode); return rc; @@ -225,15 +226,16 @@ out: int ecryptfs_read_lower(char *data, loff_t offset, size_t size, struct inode *ecryptfs_inode) { - struct ecryptfs_inode_info *inode_info = - ecryptfs_inode_to_private(ecryptfs_inode); + struct file *lower_file; mm_segment_t fs_save; ssize_t rc; - BUG_ON(!inode_info->lower_file); + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; fs_save = get_fs(); set_fs(get_ds()); - rc = vfs_read(inode_info->lower_file, data, size, &offset); + rc = vfs_read(lower_file, data, size, &offset); set_fs(fs_save); return rc; } diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 9c13412e6c99..bc84f365d75c 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -96,7 +96,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) efs_inode = (struct efs_dinode *) (bh->b_data + offset); inode->i_mode = be16_to_cpu(efs_inode->di_mode); - inode->i_nlink = be16_to_cpu(efs_inode->di_nlink); + set_nlink(inode, be16_to_cpu(efs_inode->di_nlink)); inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid); inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid); inode->i_size = be32_to_cpu(efs_inode->di_size); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index f9cfd168fbe2..828e750af23a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -37,7 +37,7 @@ #include <asm/system.h> #include <asm/io.h> #include <asm/mman.h> -#include <asm/atomic.h> +#include <linux/atomic.h> /* * LOCKING: @@ -70,6 +70,15 @@ * simultaneous inserts (A into B and B into A) from racing and * constructing a cycle without either insert observing that it is * going to. + * It is necessary to acquire multiple "ep->mtx"es at once in the + * case when one epoll fd is added to another. In this case, we + * always acquire the locks in the order of nesting (i.e. after + * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired + * before e2->mtx). Since we disallow cycles of epoll file + * descriptors, this ensures that the mutexes are well-ordered. In + * order to communicate this nesting to lockdep, when walking a tree + * of epoll file descriptors, we use the current recursion depth as + * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. @@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) * @ep: Pointer to the epoll private data structure. * @sproc: Pointer to the scan callback. * @priv: Private opaque data passed to the @sproc callback. + * @depth: The current depth of recursive f_op->poll calls. * * Returns: The same integer error code returned by the @sproc callback. */ static int ep_scan_ready_list(struct eventpoll *ep, int (*sproc)(struct eventpoll *, struct list_head *, void *), - void *priv) + void *priv, + int depth) { int error, pwake = 0; unsigned long flags; @@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, * We need to lock this because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, depth); /* * Steal the ready list, and re-init the original one to the @@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) { - return ep_scan_ready_list(priv, ep_read_events_proc, NULL); + return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1); } static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) @@ -700,7 +711,7 @@ static const struct file_operations eventpoll_fops = { .llseek = noop_llseek, }; -/* Fast test to see if the file is an evenpoll file */ +/* Fast test to see if the file is an eventpoll file */ static inline int is_file_epoll(struct file *f) { return f->f_op == &eventpoll_fops; @@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file) ep = epi->ep; list_del_init(&epi->fllink); - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); mutex_unlock(&ep->mtx); } @@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep, esed.maxevents = maxevents; esed.events = events; - return ep_scan_ready_list(ep, ep_send_events_proc, &esed); + return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0); } static inline struct timespec ep_set_mstimeout(long ms) @@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) struct rb_node *rbp; struct epitem *epi; - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, call_nests + 1); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { @@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, 0); /* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" diff --git a/fs/exec.c b/fs/exec.c index 842d5700c155..36254645b7cc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -181,14 +181,7 @@ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) return; bprm->vma_pages = pages; - -#ifdef SPLIT_RSS_COUNTING - add_mm_counter(mm, MM_ANONPAGES, diff); -#else - spin_lock(&mm->page_table_lock); add_mm_counter(mm, MM_ANONPAGES, diff); - spin_unlock(&mm->page_table_lock); -#endif } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, @@ -277,7 +270,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ - BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; @@ -848,10 +841,6 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); - if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&old_mm->oom_disable_count); - atomic_inc(&tsk->mm->oom_disable_count); - } task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { @@ -1430,9 +1419,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) } } read_unlock(&binfmt_lock); +#ifdef CONFIG_MODULES if (retval != -ENOEXEC || bprm->mm == NULL) { break; -#ifdef CONFIG_MODULES } else { #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) if (printable(bprm->buf[0]) && @@ -1440,9 +1429,13 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) printable(bprm->buf[2]) && printable(bprm->buf[3])) break; /* -ENOEXEC */ + if (try) + break; /* -ENOEXEC */ request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); -#endif } +#else + break; +#endif } return retval; } @@ -1462,6 +1455,23 @@ static int do_execve_common(const char *filename, struct files_struct *displaced; bool clear_in_exec; int retval; + const struct cred *cred = current_cred(); + + /* + * We move the actual failure in case of RLIMIT_NPROC excess from + * set*uid() to execve() because too many poorly written programs + * don't check setuid() return code. Here we additionally recheck + * whether NPROC limit is still exceeded. + */ + if ((current->flags & PF_NPROC_EXCEEDED) && + atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) { + retval = -EAGAIN; + goto out_ret; + } + + /* We're below the limit (still or again), so we don't want to make + * further execve() calls fail. */ + current->flags &= ~PF_NPROC_EXCEEDED; retval = unshare_files(&displaced); if (retval) @@ -1649,15 +1659,26 @@ expand_fail: return ret; } +static void cn_escape(char *str) +{ + for (; *str; str++) + if (*str == '/') + *str = '!'; +} + static int cn_print_exe_file(struct core_name *cn) { struct file *exe_file; - char *pathbuf, *path, *p; + char *pathbuf, *path; int ret; exe_file = get_mm_exe_file(current->mm); - if (!exe_file) - return cn_printf(cn, "(unknown)"); + if (!exe_file) { + char *commstart = cn->corename + cn->used; + ret = cn_printf(cn, "%s (path unknown)", current->comm); + cn_escape(commstart); + return ret; + } pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); if (!pathbuf) { @@ -1671,9 +1692,7 @@ static int cn_print_exe_file(struct core_name *cn) goto free_buf; } - for (p = path; *p; p++) - if (*p == '/') - *p = '!'; + cn_escape(path); ret = cn_printf(cn, "%s", path); @@ -1745,16 +1764,22 @@ static int format_corename(struct core_name *cn, long signr) break; } /* hostname */ - case 'h': + case 'h': { + char *namestart = cn->corename + cn->used; down_read(&uts_sem); err = cn_printf(cn, "%s", utsname()->nodename); up_read(&uts_sem); + cn_escape(namestart); break; + } /* executable */ - case 'e': + case 'e': { + char *commstart = cn->corename + cn->used; err = cn_printf(cn, "%s", current->comm); + cn_escape(commstart); break; + } case 'E': err = cn_print_exe_file(cn); break; @@ -2118,16 +2143,16 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) ispipe = format_corename(&cn, signr); - if (ispipe == -ENOMEM) { - printk(KERN_WARNING "format_corename failed\n"); - printk(KERN_WARNING "Aborting core\n"); - goto fail_corename; - } - if (ispipe) { int dump_count; char **helper_argv; + if (ispipe < 0) { + printk(KERN_WARNING "format_corename failed\n"); + printk(KERN_WARNING "Aborting core\n"); + goto fail_corename; + } + if (cprm.limit == 1) { /* * Normally core limits are irrelevant to pipes, since diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index 2d0f757fda3e..352ba149d23e 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -12,5 +12,9 @@ # Kbuild - Gets included from the Kernels Makefile and build system # -exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o +# ore module library +libore-y := ore.o ore_raid.o +obj-$(CONFIG_ORE) += libore.o + +exofs-y := inode.o file.o symlink.o namei.o dir.o super.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index 86194b2f799d..da42f32c49be 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -1,3 +1,14 @@ +# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects +# for every ORE user we do it like this. Any user should add itself here +# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are +# selected here, and we default to "ON". So in effect it is like been +# selected by any of the users. +config ORE + tristate + depends on EXOFS_FS || PNFS_OBJLAYOUT + select ASYNC_XOR + default SCSI_OSD_ULD + config EXOFS_FS tristate "exofs: OSD based file system support" depends on SCSI_OSD_ULD diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index c965806c2821..51f4b4c40f09 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -36,12 +36,9 @@ #include <linux/fs.h> #include <linux/time.h> #include <linux/backing-dev.h> -#include "common.h" +#include <scsi/osd_ore.h> -/* FIXME: Remove once pnfs hits mainline - * #include <linux/exportfs/pnfs_osd_xdr.h> - */ -#include "pnfs.h" +#include "common.h" #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) @@ -56,27 +53,15 @@ /* u64 has problems with printk this will cast it to unsigned long long */ #define _LLU(x) (unsigned long long)(x) -struct exofs_layout { - osd_id s_pid; /* partition ID of file system*/ - - /* Our way of looking at the data_map */ - unsigned stripe_unit; - unsigned mirrors_p1; - - unsigned group_width; - u64 group_depth; - unsigned group_count; - - enum exofs_inode_layout_gen_functions lay_func; - - unsigned s_numdevs; /* Num of devices in array */ - struct osd_dev *s_ods[0]; /* Variable length */ +struct exofs_dev { + struct ore_dev ored; + unsigned did; }; - /* * our extension to the in-memory superblock */ struct exofs_sb_info { + struct backing_dev_info bdi; /* register our bdi with VFS */ struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ int s_timeout; /* timeout for OSD operations */ uint64_t s_nextid; /* highest object ID used */ @@ -84,17 +69,10 @@ struct exofs_sb_info { spinlock_t s_next_gen_lock; /* spinlock for gen # update */ u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ - uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ - struct backing_dev_info bdi; /* register our bdi with VFS */ - - struct pnfs_osd_data_map data_map; /* Default raid to use - * FIXME: Needed ? - */ -/* struct exofs_layout dir_layout;*/ /* Default dir layout */ - struct exofs_layout layout; /* Default files layout, - * contains the variable osd_dev - * array. Keep last */ - struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ + + struct ore_layout layout; /* Default files layout */ + struct ore_comp one_comp; /* id & cred of partition id=0*/ + struct ore_components oc; /* comps for the partition */ }; /* @@ -107,7 +85,8 @@ struct exofs_i_info { uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ uint32_t i_dir_start_lookup; /* which page to start lookup */ uint64_t i_commit_size; /* the object's written length */ - uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ + struct ore_comp one_comp; /* same component for all devices */ + struct ore_components oc; /* inode view of the device table */ }; static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) @@ -115,52 +94,6 @@ static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; } -struct exofs_io_state; -typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private); - -struct exofs_io_state { - struct kref kref; - - void *private; - exofs_io_done_fn done; - - struct exofs_layout *layout; - struct osd_obj_id obj; - u8 *cred; - - /* Global read/write IO*/ - loff_t offset; - unsigned long length; - void *kern_buff; - - struct page **pages; - unsigned nr_pages; - unsigned pgbase; - unsigned pages_consumed; - - /* Attributes */ - unsigned in_attr_len; - struct osd_attr *in_attr; - unsigned out_attr_len; - struct osd_attr *out_attr; - - /* Variable array of size numdevs */ - unsigned numdevs; - struct exofs_per_dev_state { - struct osd_request *or; - struct bio *bio; - loff_t offset; - unsigned length; - unsigned dev; - } per_dev[]; -}; - -static inline unsigned exofs_io_state_size(unsigned numdevs) -{ - return sizeof(struct exofs_io_state) + - sizeof(struct exofs_per_dev_state) * numdevs; -} - /* * our inode flags */ @@ -205,12 +138,6 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode) } /* - * Given a layout, object_number and stripe_index return the associated global - * dev_index - */ -unsigned exofs_layout_od_id(struct exofs_layout *layout, - osd_id obj_no, unsigned layout_index); -/* * Maximum count of links to a file */ #define EXOFS_LINK_MAX 32000 @@ -219,44 +146,8 @@ unsigned exofs_layout_od_id(struct exofs_layout *layout, * function declarations * *************************/ -/* ios.c */ -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], - const struct osd_obj_id *obj); -int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, - u64 offset, void *p, unsigned length); - -int exofs_get_io_state(struct exofs_layout *layout, - struct exofs_io_state **ios); -void exofs_put_io_state(struct exofs_io_state *ios); - -int exofs_check_io(struct exofs_io_state *ios, u64 *resid); - -int exofs_sbi_create(struct exofs_io_state *ios); -int exofs_sbi_remove(struct exofs_io_state *ios); -int exofs_sbi_write(struct exofs_io_state *ios); -int exofs_sbi_read(struct exofs_io_state *ios); - -int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr); - -int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len); -static inline int exofs_oi_write(struct exofs_i_info *oi, - struct exofs_io_state *ios) -{ - ios->obj.id = exofs_oi_objno(oi); - ios->cred = oi->i_cred; - return exofs_sbi_write(ios); -} - -static inline int exofs_oi_read(struct exofs_i_info *oi, - struct exofs_io_state *ios) -{ - ios->obj.id = exofs_oi_objno(oi); - ios->cred = oi->i_cred; - return exofs_sbi_read(ios); -} - /* inode.c */ -unsigned exofs_max_io_pages(struct exofs_layout *layout, +unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned expected_pages); int exofs_setattr(struct dentry *, struct iattr *); int exofs_write_begin(struct file *file, struct address_space *mapping, @@ -281,6 +172,8 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, struct inode *); /* super.c */ +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], + const struct osd_obj_id *obj); int exofs_sbi_write_stats(struct exofs_sb_info *sbi); /********************* @@ -295,7 +188,6 @@ extern const struct file_operations exofs_file_operations; /* inode.c */ extern const struct address_space_operations exofs_aops; -extern const struct osd_attr g_attr_logical_length; /* namei.c */ extern const struct inode_operations exofs_dir_inode_operations; @@ -305,4 +197,35 @@ extern const struct inode_operations exofs_special_inode_operations; extern const struct inode_operations exofs_symlink_inode_operations; extern const struct inode_operations exofs_fast_symlink_inode_operations; +/* exofs_init_comps will initialize an ore_components device array + * pointing to a single ore_comp struct, and a round-robin view + * of the device table. + * The first device of each inode is the [inode->ino % num_devices] + * and the rest of the devices sequentially following where the + * first device is after the last device. + * It is assumed that the global device array at @sbi is twice + * bigger and that the device table repeats twice. + * See: exofs_read_lookup_dev_table() + */ +static inline void exofs_init_comps(struct ore_components *oc, + struct ore_comp *one_comp, + struct exofs_sb_info *sbi, osd_id oid) +{ + unsigned dev_mod = (unsigned)oid, first_dev; + + one_comp->obj.partition = sbi->one_comp.obj.partition; + one_comp->obj.id = oid; + exofs_make_credential(one_comp->cred, &one_comp->obj); + + oc->first_dev = 0; + oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * + sbi->layout.group_count; + oc->single_comp = EC_SINGLE_COMP; + oc->comps = one_comp; + + /* Round robin device view of the table */ + first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; + oc->ods = &sbi->oc.ods[first_dev]; +} + #endif diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 8472c098445d..f6dbf7768ce6 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -37,20 +37,15 @@ #define EXOFS_DBGMSG2(M...) do {} while (0) -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), - MAX_PAGES_KMALLOC = - PAGE_SIZE / sizeof(struct page *), -}; +enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), }; -unsigned exofs_max_io_pages(struct exofs_layout *layout, +unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned expected_pages) { unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); /* TODO: easily support bio chaining */ - pages = min_t(unsigned, pages, - layout->group_width * BIO_MAX_PAGES_KMALLOC); + pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE); return pages; } @@ -58,7 +53,7 @@ struct page_collect { struct exofs_sb_info *sbi; struct inode *inode; unsigned expected_pages; - struct exofs_io_state *ios; + struct ore_io_state *ios; struct page **pages; unsigned alloc_pages; @@ -68,6 +63,7 @@ struct page_collect { bool read_4_write; /* This means two things: that the read is sync * And the pages should not be unlocked. */ + struct page *that_locked_page; }; static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, @@ -86,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, pcol->length = 0; pcol->pg_first = -1; pcol->read_4_write = false; + pcol->that_locked_page = NULL; } static void _pcol_reset(struct page_collect *pcol) @@ -98,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol) pcol->length = 0; pcol->pg_first = -1; pcol->ios = NULL; + pcol->that_locked_page = NULL; /* this is probably the end of the loop but in writes * it might not end here. don't be left with nothing @@ -110,13 +108,6 @@ static int pcol_try_alloc(struct page_collect *pcol) { unsigned pages; - if (!pcol->ios) { /* First time allocate io_state */ - int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); - - if (ret) - return ret; - } - /* TODO: easily support bio chaining */ pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); @@ -140,7 +131,7 @@ static void pcol_free(struct page_collect *pcol) pcol->pages = NULL; if (pcol->ios) { - exofs_put_io_state(pcol->ios); + ore_put_io_state(pcol->ios); pcol->ios = NULL; } } @@ -156,14 +147,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page, return 0; } +enum {PAGE_WAS_NOT_IN_IO = 17}; static int update_read_page(struct page *page, int ret) { - if (ret == 0) { + switch (ret) { + case 0: /* Everything is OK */ SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - } else if (ret == -EFAULT) { + break; + case -EFAULT: /* In this case we were trying to read something that wasn't on * disk yet - return a page full of zeroes. This should be OK, * because the object should be empty (if there was a write @@ -174,16 +168,22 @@ static int update_read_page(struct page *page, int ret) SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - ret = 0; /* recovered error */ EXOFS_DBGMSG("recovered read error\n"); - } else /* Error */ + /* fall through */ + case PAGE_WAS_NOT_IN_IO: + ret = 0; /* recovered error */ + break; + default: SetPageError(page); - + } return ret; } static void update_write_page(struct page *page, int ret) { + if (unlikely(ret == PAGE_WAS_NOT_IN_IO)) + return; /* don't pass start don't collect $200 */ + if (ret) { mapping_set_error(page->mapping, ret); SetPageError(page); @@ -197,15 +197,16 @@ static void update_write_page(struct page *page, int ret) static int __readpages_done(struct page_collect *pcol) { int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = exofs_check_io(pcol->ios, &resid); + int ret = ore_check_io(pcol->ios, NULL); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else - good_bytes = pcol->length - resid; + ret = PAGE_WAS_NOT_IN_IO; + } else { + good_bytes = 0; + } EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -241,7 +242,7 @@ static int __readpages_done(struct page_collect *pcol) } /* callback of async reads */ -static void readpages_done(struct exofs_io_state *ios, void *p) +static void readpages_done(struct ore_io_state *ios, void *p) { struct page_collect *pcol = p; @@ -266,23 +267,70 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) } } +static int _maybe_not_all_in_one_io(struct ore_io_state *ios, + struct page_collect *pcol_src, struct page_collect *pcol) +{ + /* length was wrong or offset was not page aligned */ + BUG_ON(pcol_src->nr_pages < ios->nr_pages); + + if (pcol_src->nr_pages > ios->nr_pages) { + struct page **src_page; + unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; + unsigned long len_less = pcol_src->length - ios->length; + unsigned i; + int ret; + + /* This IO was trimmed */ + pcol_src->nr_pages = ios->nr_pages; + pcol_src->length = ios->length; + + /* Left over pages are passed to the next io */ + pcol->expected_pages += pages_less; + pcol->nr_pages = pages_less; + pcol->length = len_less; + src_page = pcol_src->pages + pcol_src->nr_pages; + pcol->pg_first = (*src_page)->index; + + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + return ret; + + for (i = 0; i < pages_less; ++i) + pcol->pages[i] = *src_page++; + + EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " + "pages_less=0x%x expected_pages=0x%x " + "next_offset=0x%llx next_len=0x%lx\n", + pcol_src->nr_pages, pages_less, pcol->expected_pages, + pcol->pg_first * PAGE_SIZE, pcol->length); + } + return 0; +} + static int read_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); - struct exofs_io_state *ios = pcol->ios; + struct ore_io_state *ios; struct page_collect *pcol_copy = NULL; int ret; if (!pcol->pages) return 0; + if (!pcol->ios) { + int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, + pcol->pg_first << PAGE_CACHE_SHIFT, + pcol->length, &pcol->ios); + + if (ret) + return ret; + } + + ios = pcol->ios; ios->pages = pcol->pages; - ios->nr_pages = pcol->nr_pages; - ios->length = pcol->length; - ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; if (pcol->read_4_write) { - exofs_oi_read(oi, pcol->ios); + ore_read(pcol->ios); return __readpages_done(pcol); } @@ -295,17 +343,23 @@ static int read_exec(struct page_collect *pcol) *pcol_copy = *pcol; ios->done = readpages_done; ios->private = pcol_copy; - ret = exofs_oi_read(oi, ios); + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); if (unlikely(ret)) goto err; - atomic_inc(&pcol->sbi->s_curr_pending); + EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); - EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", - ios->obj.id, _LLU(ios->offset), pcol->length); + ret = ore_read(ios); + if (unlikely(ret)) + goto err; + + atomic_inc(&pcol->sbi->s_curr_pending); - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -340,6 +394,8 @@ static int readpage_strip(void *data, struct page *page) EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, page->index); + pcol->that_locked_page = page; + if (page->index < end_index) len = PAGE_CACHE_SIZE; else if (page->index == end_index) @@ -428,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping, return ret; } + ret = read_exec(&pcol); + if (unlikely(ret)) + return ret; + return read_exec(&pcol); } @@ -457,21 +517,22 @@ static int exofs_readpage(struct file *file, struct page *page) } /* Callback for osd_write. All writes are asynchronous */ -static void writepages_done(struct exofs_io_state *ios, void *p) +static void writepages_done(struct ore_io_state *ios, void *p) { struct page_collect *pcol = p; int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = exofs_check_io(ios, &resid); + int ret = ore_check_io(ios, NULL); atomic_dec(&pcol->sbi->s_curr_pending); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else - good_bytes = pcol->length - resid; + ret = PAGE_WAS_NOT_IN_IO; + } else { + good_bytes = 0; + } EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -504,16 +565,73 @@ static void writepages_done(struct exofs_io_state *ios, void *p) EXOFS_DBGMSG2("writepages_done END\n"); } +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ + struct page_collect *pcol = priv; + pgoff_t index = offset / PAGE_SIZE; + + if (!pcol->that_locked_page || + (pcol->that_locked_page->index != index)) { + struct page *page = find_get_page(pcol->inode->i_mapping, index); + + if (!page) { + page = find_or_create_page(pcol->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + EXOFS_DBGMSG("grab_cache_page Failed " + "index=0x%llx\n", _LLU(index)); + return NULL; + } + unlock_page(page); + } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); + return page; + } else { + EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", + pcol->that_locked_page->index); + *uptodate = true; + return pcol->that_locked_page; + } +} + +static void __r4w_put_page(void *priv, struct page *page) +{ + struct page_collect *pcol = priv; + + if (pcol->that_locked_page != page) { + EXOFS_DBGMSG("index=0x%lx\n", page->index); + page_cache_release(page); + return; + } + EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index); +} + +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + static int write_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); - struct exofs_io_state *ios = pcol->ios; + struct ore_io_state *ios; struct page_collect *pcol_copy = NULL; int ret; if (!pcol->pages) return 0; + BUG_ON(pcol->ios); + ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, + pcol->pg_first << PAGE_CACHE_SHIFT, + pcol->length, &pcol->ios); + if (unlikely(ret)) + goto err; + pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); if (!pcol_copy) { EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); @@ -523,25 +641,29 @@ static int write_exec(struct page_collect *pcol) *pcol_copy = *pcol; + ios = pcol->ios; ios->pages = pcol_copy->pages; - ios->nr_pages = pcol_copy->nr_pages; - ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; - ios->length = pcol_copy->length; ios->done = writepages_done; + ios->r4w = &_r4w_op; ios->private = pcol_copy; - ret = exofs_oi_write(oi, ios); + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + + ret = ore_write(ios); if (unlikely(ret)) { - EXOFS_ERR("write_exec: exofs_oi_write() Failed\n"); + EXOFS_ERR("write_exec: ore_write() Failed\n"); goto err; } atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", - pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), - pcol->length); - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -681,14 +803,33 @@ static int exofs_writepages(struct address_space *mapping, _pcol_init(&pcol, expected_pages, mapping->host); ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); - if (ret) { + if (unlikely(ret)) { EXOFS_ERR("write_cache_pages => %d\n", ret); return ret; } - return write_exec(&pcol); + ret = write_exec(&pcol); + if (unlikely(ret)) + return ret; + + if (wbc->sync_mode == WB_SYNC_ALL) { + return write_exec(&pcol); /* pump the last reminder */ + } else if (pcol.nr_pages) { + /* not SYNC let the reminder join the next writeout */ + unsigned i; + + for (i = 0; i < pcol.nr_pages; i++) { + struct page *page = pcol.pages[i]; + + end_page_writeback(page); + set_page_dirty(page); + unlock_page(page); + } + } + return 0; } +/* static int exofs_writepage(struct page *page, struct writeback_control *wbc) { struct page_collect pcol; @@ -704,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc) return write_exec(&pcol); } - +*/ /* i_mutex held using inode->i_size directly */ static void _write_failed(struct inode *inode, loff_t to) { @@ -810,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset) const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, - .writepage = exofs_writepage, + .writepage = NULL, .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, .write_end = exofs_write_end, @@ -844,17 +985,15 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode) return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); } -const struct osd_attr g_attr_logical_length = ATTR_DEF( - OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); - static int _do_truncate(struct inode *inode, loff_t newsize) { struct exofs_i_info *oi = exofs_i(inode); + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; int ret; inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ret = exofs_oi_truncate(oi, (u64)newsize); + ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); if (likely(!ret)) truncate_setsize(inode, newsize); @@ -917,30 +1056,26 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, [1] = g_attr_inode_file_layout, [2] = g_attr_inode_dir_layout, }; - struct exofs_io_state *ios; + struct ore_io_state *ios; struct exofs_on_disk_inode_layout *layout; int ret; - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; } - ios->obj.id = exofs_oi_objno(oi); - exofs_make_credential(oi->i_cred, &ios->obj); - ios->cred = oi->i_cred; - - attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); - attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); - ret = exofs_sbi_read(ios); + ret = ore_read(ios); if (unlikely(ret)) { EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", - _LLU(ios->obj.id), ret); + _LLU(oi->one_comp.obj.id), ret); memset(inode, 0, sizeof(*inode)); inode->i_mode = 0040000 | (0777 & ~022); /* If object is lost on target we might as well enable it's @@ -990,7 +1125,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, } out: - exofs_put_io_state(ios); + ore_put_io_state(ios); return ret; } @@ -1016,6 +1151,8 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) return inode; oi = exofs_i(inode); __oi_init(oi); + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, + exofs_oi_objno(oi)); /* read the inode from the osd */ ret = exofs_get_inode(sb, oi, &fcb); @@ -1028,7 +1165,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(fcb.i_mode); inode->i_uid = le32_to_cpu(fcb.i_uid); inode->i_gid = le32_to_cpu(fcb.i_gid); - inode->i_nlink = le16_to_cpu(fcb.i_links_count); + set_nlink(inode, le16_to_cpu(fcb.i_links_count)); inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); @@ -1107,21 +1244,22 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi) * set the obj_created flag so that other methods know that the object exists on * the OSD. */ -static void create_done(struct exofs_io_state *ios, void *p) +static void create_done(struct ore_io_state *ios, void *p) { struct inode *inode = p; struct exofs_i_info *oi = exofs_i(inode); struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; int ret; - ret = exofs_check_io(ios, NULL); - exofs_put_io_state(ios); + ret = ore_check_io(ios, NULL); + ore_put_io_state(ios); atomic_dec(&sbi->s_curr_pending); if (unlikely(ret)) { EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", - _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); + _LLU(exofs_oi_objno(oi)), + _LLU(oi->one_comp.obj.partition)); /*TODO: When FS is corrupted creation can fail, object already * exist. Get rid of this asynchronous creation, if exist * increment the obj counter and try the next object. Until we @@ -1140,14 +1278,13 @@ static void create_done(struct exofs_io_state *ios, void *p) */ struct inode *exofs_new_inode(struct inode *dir, int mode) { - struct super_block *sb; + struct super_block *sb = dir->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; struct inode *inode; struct exofs_i_info *oi; - struct exofs_sb_info *sbi; - struct exofs_io_state *ios; + struct ore_io_state *ios; int ret; - sb = dir->i_sb; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -1157,8 +1294,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) set_obj_2bcreated(oi); - sbi = sb->s_fs_info; - inode->i_mapping->backing_dev_info = sb->s_bdi; inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; @@ -1170,25 +1305,24 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) spin_unlock(&sbi->s_next_gen_lock); insert_inode_hash(inode); + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, + exofs_oi_objno(oi)); exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ mark_inode_dirty(inode); - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { - EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); + EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); return ERR_PTR(ret); } - ios->obj.id = exofs_oi_objno(oi); - exofs_make_credential(oi->i_cred, &ios->obj); - ios->done = create_done; ios->private = inode; - ios->cred = oi->i_cred; - ret = exofs_sbi_create(ios); + + ret = ore_create(ios); if (ret) { - exofs_put_io_state(ios); + ore_put_io_state(ios); return ERR_PTR(ret); } atomic_inc(&sbi->s_curr_pending); @@ -1207,11 +1341,11 @@ struct updatei_args { /* * Callback function from exofs_update_inode(). */ -static void updatei_done(struct exofs_io_state *ios, void *p) +static void updatei_done(struct ore_io_state *ios, void *p) { struct updatei_args *args = p; - exofs_put_io_state(ios); + ore_put_io_state(ios); atomic_dec(&args->sbi->s_curr_pending); @@ -1227,7 +1361,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) struct exofs_i_info *oi = exofs_i(inode); struct super_block *sb = inode->i_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct exofs_io_state *ios; + struct ore_io_state *ios; struct osd_attr attr; struct exofs_fcb *fcb; struct updatei_args *args; @@ -1266,9 +1400,9 @@ static int exofs_update_inode(struct inode *inode, int do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); goto free_args; } @@ -1285,13 +1419,13 @@ static int exofs_update_inode(struct inode *inode, int do_sync) ios->private = args; } - ret = exofs_oi_write(oi, ios); + ret = ore_write(ios); if (!do_sync && !ret) { atomic_inc(&sbi->s_curr_pending); goto out; /* deallocation in updatei_done */ } - exofs_put_io_state(ios); + ore_put_io_state(ios); free_args: kfree(args); out: @@ -1310,11 +1444,11 @@ int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) * Callback function from exofs_delete_inode() - don't have much cleaning up to * do. */ -static void delete_done(struct exofs_io_state *ios, void *p) +static void delete_done(struct ore_io_state *ios, void *p) { struct exofs_sb_info *sbi = p; - exofs_put_io_state(ios); + ore_put_io_state(ios); atomic_dec(&sbi->s_curr_pending); } @@ -1329,7 +1463,7 @@ void exofs_evict_inode(struct inode *inode) struct exofs_i_info *oi = exofs_i(inode); struct super_block *sb = inode->i_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct exofs_io_state *ios; + struct ore_io_state *ios; int ret; truncate_inode_pages(&inode->i_data, 0); @@ -1349,20 +1483,19 @@ void exofs_evict_inode(struct inode *inode) /* ignore the error, attempt a remove anyway */ /* Now Remove the OSD objects */ - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); + EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); return; } - ios->obj.id = exofs_oi_objno(oi); ios->done = delete_done; ios->private = sbi; - ios->cred = oi->i_cred; - ret = exofs_sbi_remove(ios); + + ret = ore_remove(ios); if (ret) { - EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); - exofs_put_io_state(ios); + EXOFS_ERR("%s: ore_remove failed\n", __func__); + ore_put_io_state(ios); return; } atomic_inc(&sbi->s_curr_pending); diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c deleted file mode 100644 index f74a2ec027a6..000000000000 --- a/fs/exofs/ios.c +++ /dev/null @@ -1,803 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <linux/slab.h> -#include <scsi/scsi_device.h> -#include <asm/div64.h> - -#include "exofs.h" - -#define EXOFS_DBGMSG2(M...) do {} while (0) -/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */ - -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) -{ - osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); -} - -int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, - u64 offset, void *p, unsigned length) -{ - struct osd_request *or = osd_start_request(od, GFP_KERNEL); -/* struct osd_sense_info osi = {.key = 0};*/ - int ret; - - if (unlikely(!or)) { - EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); - return -ENOMEM; - } - ret = osd_req_read_kern(or, obj, offset, p, length); - if (unlikely(ret)) { - EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); - goto out; - } - - ret = osd_finalize_request(or, 0, cred, NULL); - if (unlikely(ret)) { - EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); - goto out; - } - - ret = osd_execute_request(or); - if (unlikely(ret)) - EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); - /* osd_req_decode_sense(or, ret); */ - -out: - osd_end_request(or); - return ret; -} - -int exofs_get_io_state(struct exofs_layout *layout, - struct exofs_io_state **pios) -{ - struct exofs_io_state *ios; - - /*TODO: Maybe use kmem_cach per sbi of size - * exofs_io_state_size(layout->s_numdevs) - */ - ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); - if (unlikely(!ios)) { - EXOFS_DBGMSG("Failed kzalloc bytes=%d\n", - exofs_io_state_size(layout->s_numdevs)); - *pios = NULL; - return -ENOMEM; - } - - ios->layout = layout; - ios->obj.partition = layout->s_pid; - *pios = ios; - return 0; -} - -void exofs_put_io_state(struct exofs_io_state *ios) -{ - if (ios) { - unsigned i; - - for (i = 0; i < ios->numdevs; i++) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[i]; - - if (per_dev->or) - osd_end_request(per_dev->or); - if (per_dev->bio) - bio_put(per_dev->bio); - } - - kfree(ios); - } -} - -unsigned exofs_layout_od_id(struct exofs_layout *layout, - osd_id obj_no, unsigned layout_index) -{ -/* switch (layout->lay_func) { - case LAYOUT_MOVING_WINDOW: - {*/ - unsigned dev_mod = obj_no; - - return (layout_index + dev_mod * layout->mirrors_p1) % - layout->s_numdevs; -/* } - case LAYOUT_FUNC_IMPLICT: - return layout->devs[layout_index]; - }*/ -} - -static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, - unsigned layout_index) -{ - return ios->layout->s_ods[ - exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)]; -} - -static void _sync_done(struct exofs_io_state *ios, void *p) -{ - struct completion *waiting = p; - - complete(waiting); -} - -static void _last_io(struct kref *kref) -{ - struct exofs_io_state *ios = container_of( - kref, struct exofs_io_state, kref); - - ios->done(ios, ios->private); -} - -static void _done_io(struct osd_request *or, void *p) -{ - struct exofs_io_state *ios = p; - - kref_put(&ios->kref, _last_io); -} - -static int exofs_io_execute(struct exofs_io_state *ios) -{ - DECLARE_COMPLETION_ONSTACK(wait); - bool sync = (ios->done == NULL); - int i, ret; - - if (sync) { - ios->done = _sync_done; - ios->private = &wait; - } - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - if (unlikely(!or)) - continue; - - ret = osd_finalize_request(or, 0, ios->cred, NULL); - if (unlikely(ret)) { - EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", - ret); - return ret; - } - } - - kref_init(&ios->kref); - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - if (unlikely(!or)) - continue; - - kref_get(&ios->kref); - osd_execute_request_async(or, _done_io, ios); - } - - kref_put(&ios->kref, _last_io); - ret = 0; - - if (sync) { - wait_for_completion(&wait); - ret = exofs_check_io(ios, NULL); - } - return ret; -} - -static void _clear_bio(struct bio *bio) -{ - struct bio_vec *bv; - unsigned i; - - __bio_for_each_segment(bv, bio, i, 0) { - unsigned this_count = bv->bv_len; - - if (likely(PAGE_SIZE == this_count)) - clear_highpage(bv->bv_page); - else - zero_user(bv->bv_page, bv->bv_offset, this_count); - } -} - -int exofs_check_io(struct exofs_io_state *ios, u64 *resid) -{ - enum osd_err_priority acumulated_osd_err = 0; - int acumulated_lin_err = 0; - int i; - - for (i = 0; i < ios->numdevs; i++) { - struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; - int ret; - - if (unlikely(!or)) - continue; - - ret = osd_req_decode_sense(or, &osi); - if (likely(!ret)) - continue; - - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { - /* start read offset passed endof file */ - _clear_bio(ios->per_dev[i].bio); - EXOFS_DBGMSG("start read offset passed end of file " - "offset=0x%llx, length=0x%llx\n", - _LLU(ios->per_dev[i].offset), - _LLU(ios->per_dev[i].length)); - - continue; /* we recovered */ - } - - if (osi.osd_err_pri >= acumulated_osd_err) { - acumulated_osd_err = osi.osd_err_pri; - acumulated_lin_err = ret; - } - } - - /* TODO: raid specific residual calculations */ - if (resid) { - if (likely(!acumulated_lin_err)) - *resid = 0; - else - *resid = ios->length; - } - - return acumulated_lin_err; -} - -/* - * L - logical offset into the file - * - * U - The number of bytes in a stripe within a group - * - * U = stripe_unit * group_width - * - * T - The number of bytes striped within a group of component objects - * (before advancing to the next group) - * - * T = stripe_unit * group_width * group_depth - * - * S - The number of bytes striped across all component objects - * before the pattern repeats - * - * S = stripe_unit * group_width * group_depth * group_count - * - * M - The "major" (i.e., across all components) stripe number - * - * M = L / S - * - * G - Counts the groups from the beginning of the major stripe - * - * G = (L - (M * S)) / T [or (L % S) / T] - * - * H - The byte offset within the group - * - * H = (L - (M * S)) % T [or (L % S) % T] - * - * N - The "minor" (i.e., across the group) stripe number - * - * N = H / U - * - * C - The component index coresponding to L - * - * C = (H - (N * U)) / stripe_unit + G * group_width - * [or (L % U) / stripe_unit + G * group_width] - * - * O - The component offset coresponding to L - * - * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit - */ -struct _striping_info { - u64 obj_offset; - u64 group_length; - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, - struct _striping_info *si) -{ - u32 stripe_unit = ios->layout->stripe_unit; - u32 group_width = ios->layout->group_width; - u64 group_depth = ios->layout->group_depth; - - u32 U = stripe_unit * group_width; - u64 T = U * group_depth; - u64 S = T * ios->layout->group_count; - u64 M = div64_u64(file_offset, S); - - /* - G = (L - (M * S)) / T - H = (L - (M * S)) % T - */ - u64 LmodS = file_offset - M * S; - u32 G = div64_u64(LmodS, T); - u64 H = LmodS - G * T; - - u32 N = div_u64(H, U); - - /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= ios->layout->mirrors_p1; - - div_u64_rem(file_offset, stripe_unit, &si->unit_off); - - si->obj_offset = si->unit_off + (N * stripe_unit) + - (M * group_depth * stripe_unit); - - si->group_length = T - H; -} - -static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, - unsigned pgbase, struct exofs_per_dev_state *per_dev, - int cur_len) -{ - unsigned pg = *cur_pg; - struct request_queue *q = - osd_request_queue(exofs_ios_od(ios, per_dev->dev)); - - per_dev->length += cur_len; - - if (per_dev->bio == NULL) { - unsigned pages_in_stripe = ios->layout->group_width * - (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->nr_pages + pages_in_stripe) / - ios->layout->group_width; - - per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); - if (unlikely(!per_dev->bio)) { - EXOFS_DBGMSG("Failed to allocate BIO size=%u\n", - bio_size); - return -ENOMEM; - } - } - - while (cur_len > 0) { - unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); - unsigned added_len; - - BUG_ON(ios->nr_pages <= pg); - cur_len -= pglen; - - added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], - pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; - pgbase = 0; - ++pg; - } - BUG_ON(cur_len); - - *cur_pg = pg; - return 0; -} - -static int _prepare_one_group(struct exofs_io_state *ios, u64 length, - struct _striping_info *si) -{ - unsigned stripe_unit = ios->layout->stripe_unit; - unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; - unsigned dev = si->dev; - unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; - unsigned cur_pg = ios->pages_consumed; - int ret = 0; - - while (length) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[dev]; - unsigned cur_len, page_off = 0; - - if (!per_dev->length) { - per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { - per_dev->offset = si->obj_offset; - cur_len = stripe_unit - si->unit_off; - page_off = si->unit_off & ~PAGE_MASK; - BUG_ON(page_off && (page_off != ios->pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; - cur_len = stripe_unit; - } - - if (max_comp < dev) - max_comp = dev; - } else { - cur_len = stripe_unit; - } - if (cur_len >= length) - cur_len = length; - - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len); - if (unlikely(ret)) - goto out; - - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; - - length -= cur_len; - } -out: - ios->numdevs = max_comp + mirrors_p1; - ios->pages_consumed = cur_pg; - return ret; -} - -static int _prepare_for_striping(struct exofs_io_state *ios) -{ - u64 length = ios->length; - u64 offset = ios->offset; - struct _striping_info si; - int ret = 0; - - if (!ios->pages) { - if (ios->kern_buff) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; - - _calc_stripe_info(ios, ios->offset, &si); - per_dev->offset = si.obj_offset; - per_dev->dev = si.dev; - - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (si.unit_off + ios->length > - ios->layout->stripe_unit)); - } - ios->numdevs = ios->layout->mirrors_p1; - return 0; - } - - while (length) { - _calc_stripe_info(ios, offset, &si); - - if (length < si.group_length) - si.group_length = length; - - ret = _prepare_one_group(ios, si.group_length, &si); - if (unlikely(ret)) - goto out; - - offset += si.group_length; - length -= si.group_length; - } - -out: - return ret; -} - -int exofs_sbi_create(struct exofs_io_state *ios) -{ - int i, ret; - - for (i = 0; i < ios->layout->s_numdevs; i++) { - struct osd_request *or; - - or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - ios->per_dev[i].or = or; - ios->numdevs++; - - osd_req_create_object(or, &ios->obj); - } - ret = exofs_io_execute(ios); - -out: - return ret; -} - -int exofs_sbi_remove(struct exofs_io_state *ios) -{ - int i, ret; - - for (i = 0; i < ios->layout->s_numdevs; i++) { - struct osd_request *or; - - or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - ios->per_dev[i].or = or; - ios->numdevs++; - - osd_req_remove_object(or, &ios->obj); - } - ret = exofs_io_execute(ios); - -out: - return ret; -} - -static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) -{ - struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; - unsigned dev = ios->per_dev[cur_comp].dev; - unsigned last_comp = cur_comp + ios->layout->mirrors_p1; - int ret = 0; - - if (ios->pages && !master_dev->length) - return 0; /* Just an empty slot */ - - for (; cur_comp < last_comp; ++cur_comp, ++dev) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; - struct osd_request *or; - - or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - per_dev->or = or; - per_dev->offset = master_dev->offset; - - if (ios->pages) { - struct bio *bio; - - if (per_dev != master_dev) { - bio = bio_kmalloc(GFP_KERNEL, - master_dev->bio->bi_max_vecs); - if (unlikely(!bio)) { - EXOFS_DBGMSG( - "Failed to allocate BIO size=%u\n", - master_dev->bio->bi_max_vecs); - ret = -ENOMEM; - goto out; - } - - __bio_clone(bio, master_dev->bio); - bio->bi_bdev = NULL; - bio->bi_next = NULL; - per_dev->length = master_dev->length; - per_dev->bio = bio; - per_dev->dev = dev; - } else { - bio = master_dev->bio; - /* FIXME: bio_set_dir() */ - bio->bi_rw |= REQ_WRITE; - } - - osd_req_write(or, &ios->obj, per_dev->offset, bio, - per_dev->length); - EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), _LLU(per_dev->offset), - _LLU(per_dev->length), dev); - } else if (ios->kern_buff) { - ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, - ios->kern_buff, ios->length); - if (unlikely(ret)) - goto out; - EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), _LLU(per_dev->offset), - _LLU(ios->length), dev); - } else { - osd_req_set_attributes(or, &ios->obj); - EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", - _LLU(ios->obj.id), ios->out_attr_len, dev); - } - - if (ios->out_attr) - osd_req_add_set_attr_list(or, ios->out_attr, - ios->out_attr_len); - - if (ios->in_attr) - osd_req_add_get_attr_list(or, ios->in_attr, - ios->in_attr_len); - } - -out: - return ret; -} - -int exofs_sbi_write(struct exofs_io_state *ios) -{ - int i; - int ret; - - ret = _prepare_for_striping(ios); - if (unlikely(ret)) - return ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _sbi_write_mirror(ios, i); - if (unlikely(ret)) - return ret; - } - - ret = exofs_io_execute(ios); - return ret; -} - -static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) -{ - struct osd_request *or; - struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; - unsigned first_dev = (unsigned)ios->obj.id; - - if (ios->pages && !per_dev->length) - return 0; /* Just an empty slot */ - - first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; - or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - return -ENOMEM; - } - per_dev->or = or; - - if (ios->pages) { - osd_req_read(or, &ios->obj, per_dev->offset, - per_dev->bio, per_dev->length); - EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" - " dev=%d\n", _LLU(ios->obj.id), - _LLU(per_dev->offset), _LLU(per_dev->length), - first_dev); - } else if (ios->kern_buff) { - int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, - ios->kern_buff, ios->length); - EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d ret=>%d\n", - _LLU(ios->obj.id), _LLU(per_dev->offset), - _LLU(ios->length), first_dev, ret); - if (unlikely(ret)) - return ret; - } else { - osd_req_get_attributes(or, &ios->obj); - EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", - _LLU(ios->obj.id), ios->in_attr_len, first_dev); - } - if (ios->out_attr) - osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); - - if (ios->in_attr) - osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); - - return 0; -} - -int exofs_sbi_read(struct exofs_io_state *ios) -{ - int i; - int ret; - - ret = _prepare_for_striping(ios); - if (unlikely(ret)) - return ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _sbi_read_mirror(ios, i); - if (unlikely(ret)) - return ret; - } - - ret = exofs_io_execute(ios); - return ret; -} - -int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) -{ - struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ - void *iter = NULL; - int nelem; - - do { - nelem = 1; - osd_req_decode_get_attr_list(ios->per_dev[0].or, - &cur_attr, &nelem, &iter); - if ((cur_attr.attr_page == attr->attr_page) && - (cur_attr.attr_id == attr->attr_id)) { - attr->len = cur_attr.len; - attr->val_ptr = cur_attr.val_ptr; - return 0; - } - } while (iter); - - return -EIO; -} - -static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, - struct osd_attr *attr) -{ - int last_comp = cur_comp + ios->layout->mirrors_p1; - - for (; cur_comp < last_comp; ++cur_comp) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; - struct osd_request *or; - - or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - return -ENOMEM; - } - per_dev->or = or; - - osd_req_set_attributes(or, &ios->obj); - osd_req_add_set_attr_list(or, attr, 1); - } - - return 0; -} - -int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) -{ - struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; - struct exofs_io_state *ios; - struct exofs_trunc_attr { - struct osd_attr attr; - __be64 newsize; - } *size_attrs; - struct _striping_info si; - int i, ret; - - ret = exofs_get_io_state(&sbi->layout, &ios); - if (unlikely(ret)) - return ret; - - size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), - GFP_KERNEL); - if (unlikely(!size_attrs)) { - ret = -ENOMEM; - goto out; - } - - ios->obj.id = exofs_oi_objno(oi); - ios->cred = oi->i_cred; - - ios->numdevs = ios->layout->s_numdevs; - _calc_stripe_info(ios, size, &si); - - for (i = 0; i < ios->layout->group_width; ++i) { - struct exofs_trunc_attr *size_attr = &size_attrs[i]; - u64 obj_size; - - if (i < si.dev) - obj_size = si.obj_offset + - ios->layout->stripe_unit - si.unit_off; - else if (i == si.dev) - obj_size = si.obj_offset; - else /* i > si.dev */ - obj_size = si.obj_offset - si.unit_off; - - size_attr->newsize = cpu_to_be64(obj_size); - size_attr->attr = g_attr_logical_length; - size_attr->attr.val_ptr = &size_attr->newsize; - - ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, - &size_attr->attr); - if (unlikely(ret)) - goto out; - } - ret = exofs_io_execute(ios); - -out: - kfree(size_attrs); - exofs_put_io_state(ios); - return ret; -} diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c new file mode 100644 index 000000000000..d271ad837202 --- /dev/null +++ b/fs/exofs/ore.c @@ -0,0 +1,1126 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/slab.h> +#include <linux/module.h> +#include <asm/div64.h> +#include <linux/lcm.h> + +#include "ore_raid.h" + +MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); +MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); +MODULE_LICENSE("GPL"); + +/* ore_verify_layout does a couple of things: + * 1. Given a minimum number of needed parameters fixes up the rest of the + * members to be operatonals for the ore. The needed parameters are those + * that are defined by the pnfs-objects layout STD. + * 2. Check to see if the current ore code actually supports these parameters + * for example stripe_unit must be a multple of the system PAGE_SIZE, + * and etc... + * 3. Cache some havily used calculations that will be needed by users. + */ + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; + +int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) +{ + u64 stripe_length; + + switch (layout->raid_algorithm) { + case PNFS_OSD_RAID_0: + layout->parity = 0; + break; + case PNFS_OSD_RAID_5: + layout->parity = 1; + break; + case PNFS_OSD_RAID_PQ: + case PNFS_OSD_RAID_4: + default: + ORE_ERR("Only RAID_0/5 for now\n"); + return -EINVAL; + } + if (0 != (layout->stripe_unit & ~PAGE_MASK)) { + ORE_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(layout->stripe_unit), PAGE_SIZE); + return -EINVAL; + } + if (layout->group_width) { + if (!layout->group_depth) { + ORE_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + if (total_comps < (layout->group_width * layout->mirrors_p1)) { + ORE_ERR("Data Map wrong, " + "numdevs=%d < group_width=%d * mirrors=%d\n", + total_comps, layout->group_width, + layout->mirrors_p1); + return -EINVAL; + } + layout->group_count = total_comps / layout->mirrors_p1 / + layout->group_width; + } else { + if (layout->group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %lld\n", + _LLU(layout->group_depth)); + } + layout->group_width = total_comps / layout->mirrors_p1; + layout->group_depth = -1; + layout->group_count = 1; + } + + stripe_length = (u64)layout->group_width * layout->stripe_unit; + if (stripe_length >= (1ULL << 32)) { + ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", + _LLU(stripe_length)); + return -EINVAL; + } + + layout->max_io_length = + (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * + layout->group_width; + if (layout->parity) { + unsigned stripe_length = + (layout->group_width - layout->parity) * + layout->stripe_unit; + + layout->max_io_length /= stripe_length; + layout->max_io_length *= stripe_length; + } + return 0; +} +EXPORT_SYMBOL(ore_verify_layout); + +static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) +{ + return ios->oc->comps[index & ios->oc->single_comp].cred; +} + +static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) +{ + return &ios->oc->comps[index & ios->oc->single_comp].obj; +} + +static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) +{ + ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", + ios->oc->first_dev, ios->oc->numdevs, index, + ios->oc->ods); + + return ore_comp_dev(ios->oc, index); +} + +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios) +{ + struct ore_io_state *ios; + struct page **pages; + struct osd_sg_entry *sgilist; + struct __alloc_all_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + union { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + }; + } *_aios; + + if (likely(sizeof(*_aios) <= PAGE_SIZE)) { + _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); + if (unlikely(!_aios)) { + ORE_DBGMSG("Failed kzalloc bytes=%zd\n", + sizeof(*_aios)); + *pios = NULL; + return -ENOMEM; + } + pages = num_par_pages ? _aios->pages : NULL; + sgilist = sgs_per_dev ? _aios->sglist : NULL; + ios = &_aios->ios; + } else { + struct __alloc_small_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + } *_aio_small; + union __extra_part { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + } *extra_part; + + _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); + if (unlikely(!_aio_small)) { + ORE_DBGMSG("Failed alloc first part bytes=%zd\n", + sizeof(*_aio_small)); + *pios = NULL; + return -ENOMEM; + } + extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); + if (unlikely(!extra_part)) { + ORE_DBGMSG("Failed alloc second part bytes=%zd\n", + sizeof(*extra_part)); + kfree(_aio_small); + *pios = NULL; + return -ENOMEM; + } + + pages = num_par_pages ? extra_part->pages : NULL; + sgilist = sgs_per_dev ? extra_part->sglist : NULL; + /* In this case the per_dev[0].sgilist holds the pointer to + * be freed + */ + ios = &_aio_small->ios; + ios->extra_part_alloc = true; + } + + if (pages) { + ios->parity_pages = pages; + ios->max_par_pages = num_par_pages; + } + if (sgilist) { + unsigned d; + + for (d = 0; d < numdevs; ++d) { + ios->per_dev[d].sglist = sgilist; + sgilist += sgs_per_dev; + } + ios->sgs_per_dev = sgs_per_dev; + } + + ios->layout = layout; + ios->oc = oc; + *pios = ios; + return 0; +} + +/* Allocate an io_state for only a single group of devices + * + * If a user needs to call ore_read/write() this version must be used becase it + * allocates extra stuff for striping and raid. + * The ore might decide to only IO less then @length bytes do to alignmets + * and constrains as follows: + * - The IO cannot cross group boundary. + * - In raid5/6 The end of the IO must align at end of a stripe eg. + * (@offset + @length) % strip_size == 0. Or the complete range is within a + * single stripe. + * - Memory condition only permitted a shorter IO. (A user can use @length=~0 + * And check the returned ios->length for max_io_size.) + * + * The caller must check returned ios->length (and/or ios->nr_pages) and + * re-issue these pages that fall outside of ios->length + */ +int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, + bool is_reading, u64 offset, u64 length, + struct ore_io_state **pios) +{ + struct ore_io_state *ios; + unsigned numdevs = layout->group_width * layout->mirrors_p1; + unsigned sgs_per_dev = 0, max_par_pages = 0; + int ret; + + if (layout->parity && length) { + unsigned data_devs = layout->group_width - layout->parity; + unsigned stripe_size = layout->stripe_unit * data_devs; + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + u32 remainder; + u64 num_stripes; + u64 num_raid_units; + + num_stripes = div_u64_rem(length, stripe_size, &remainder); + if (remainder) + ++num_stripes; + + num_raid_units = num_stripes * layout->parity; + + if (is_reading) { + /* For reads add per_dev sglist array */ + /* TODO: Raid 6 we need twice more. Actually: + * num_stripes / LCMdP(W,P); + * if (W%P != 0) num_stripes *= parity; + */ + + /* first/last seg is split */ + num_raid_units += layout->group_width; + sgs_per_dev = div_u64(num_raid_units, data_devs); + } else { + /* For Writes add parity pages array. */ + max_par_pages = num_raid_units * pages_in_unit * + sizeof(struct page *); + } + } + + ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, + pios); + if (unlikely(ret)) + return ret; + + ios = *pios; + ios->reading = is_reading; + ios->offset = offset; + + if (length) { + ore_calc_stripe_info(layout, offset, length, &ios->si); + ios->length = ios->si.length; + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + if (layout->parity) + _ore_post_alloc_raid_stuff(ios); + } + + return 0; +} +EXPORT_SYMBOL(ore_get_rw_state); + +/* Allocate an io_state for all the devices in the comps array + * + * This version of io_state allocation is used mostly by create/remove + * and trunc where we currently need all the devices. The only wastful + * bit is the read/write_attributes with no IO. Those sites should + * be converted to use ore_get_rw_state() with length=0 + */ +int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, + struct ore_io_state **pios) +{ + return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); +} +EXPORT_SYMBOL(ore_get_io_state); + +void ore_put_io_state(struct ore_io_state *ios) +{ + if (ios) { + unsigned i; + + for (i = 0; i < ios->numdevs; i++) { + struct ore_per_dev_state *per_dev = &ios->per_dev[i]; + + if (per_dev->or) + osd_end_request(per_dev->or); + if (per_dev->bio) + bio_put(per_dev->bio); + } + + _ore_free_raid_stuff(ios); + kfree(ios); + } +} +EXPORT_SYMBOL(ore_put_io_state); + +static void _sync_done(struct ore_io_state *ios, void *p) +{ + struct completion *waiting = p; + + complete(waiting); +} + +static void _last_io(struct kref *kref) +{ + struct ore_io_state *ios = container_of( + kref, struct ore_io_state, kref); + + ios->done(ios, ios->private); +} + +static void _done_io(struct osd_request *or, void *p) +{ + struct ore_io_state *ios = p; + + kref_put(&ios->kref, _last_io); +} + +int ore_io_execute(struct ore_io_state *ios) +{ + DECLARE_COMPLETION_ONSTACK(wait); + bool sync = (ios->done == NULL); + int i, ret; + + if (sync) { + ios->done = _sync_done; + ios->private = &wait; + } + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL); + if (unlikely(ret)) { + ORE_DBGMSG("Failed to osd_finalize_request() => %d\n", + ret); + return ret; + } + } + + kref_init(&ios->kref); + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + kref_get(&ios->kref); + osd_execute_request_async(or, _done_io, ios); + } + + kref_put(&ios->kref, _last_io); + ret = 0; + + if (sync) { + wait_for_completion(&wait); + ret = ore_check_io(ios, NULL); + } + return ret; +} + +static void _clear_bio(struct bio *bio) +{ + struct bio_vec *bv; + unsigned i; + + __bio_for_each_segment(bv, bio, i, 0) { + unsigned this_count = bv->bv_len; + + if (likely(PAGE_SIZE == this_count)) + clear_highpage(bv->bv_page); + else + zero_user(bv->bv_page, bv->bv_offset, this_count); + } +} + +int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) +{ + enum osd_err_priority acumulated_osd_err = 0; + int acumulated_lin_err = 0; + int i; + + for (i = 0; i < ios->numdevs; i++) { + struct osd_sense_info osi; + struct ore_per_dev_state *per_dev = &ios->per_dev[i]; + struct osd_request *or = per_dev->or; + int ret; + + if (unlikely(!or)) + continue; + + ret = osd_req_decode_sense(or, &osi); + if (likely(!ret)) + continue; + + if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { + /* start read offset passed endof file */ + _clear_bio(per_dev->bio); + ORE_DBGMSG("start read offset passed end of file " + "offset=0x%llx, length=0x%llx\n", + _LLU(per_dev->offset), + _LLU(per_dev->length)); + + continue; /* we recovered */ + } + + if (on_dev_error) { + u64 residual = ios->reading ? + or->in.residual : or->out.residual; + u64 offset = (ios->offset + ios->length) - residual; + struct ore_dev *od = ios->oc->ods[ + per_dev->dev - ios->oc->first_dev]; + + on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri, + offset, residual); + } + if (osi.osd_err_pri >= acumulated_osd_err) { + acumulated_osd_err = osi.osd_err_pri; + acumulated_lin_err = ret; + } + } + + return acumulated_lin_err; +} +EXPORT_SYMBOL(ore_check_io); + +/* + * L - logical offset into the file + * + * D - number of Data devices + * D = group_width - parity + * + * U - The number of bytes in a stripe within a group + * U = stripe_unit * D + * + * T - The number of bytes striped within a group of component objects + * (before advancing to the next group) + * T = U * group_depth + * + * S - The number of bytes striped across all component objects + * before the pattern repeats + * S = T * group_count + * + * M - The "major" (i.e., across all components) cycle number + * M = L / S + * + * G - Counts the groups from the beginning of the major cycle + * G = (L - (M * S)) / T [or (L % S) / T] + * + * H - The byte offset within the group + * H = (L - (M * S)) % T [or (L % S) % T] + * + * N - The "minor" (i.e., across the group) stripe number + * N = H / U + * + * C - The component index coresponding to L + * + * C = (H - (N * U)) / stripe_unit + G * D + * [or (L % U) / stripe_unit + G * D] + * + * O - The component offset coresponding to L + * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit + * + * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity + * divide by parity + * LCMdP = lcm(group_width, parity) / parity + * + * R - The parity Rotation stripe + * (Note parity cycle always starts at a group's boundary) + * R = N % LCMdP + * + * I = the first parity device index + * I = (group_width + group_width - R*parity - parity) % group_width + * + * Craid - The component index Rotated + * Craid = (group_width + C - R*parity) % group_width + * (We add the group_width to avoid negative numbers modulo math) + */ +void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + u64 length, struct ore_striping_info *si) +{ + u32 stripe_unit = layout->stripe_unit; + u32 group_width = layout->group_width; + u64 group_depth = layout->group_depth; + u32 parity = layout->parity; + + u32 D = group_width - parity; + u32 U = D * stripe_unit; + u64 T = U * group_depth; + u64 S = T * layout->group_count; + u64 M = div64_u64(file_offset, S); + + /* + G = (L - (M * S)) / T + H = (L - (M * S)) % T + */ + u64 LmodS = file_offset - M * S; + u32 G = div64_u64(LmodS, T); + u64 H = LmodS - G * T; + + u32 N = div_u64(H, U); + + /* "H - (N * U)" is just "H % U" so it's bound to u32 */ + u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; + + div_u64_rem(file_offset, stripe_unit, &si->unit_off); + + si->obj_offset = si->unit_off + (N * stripe_unit) + + (M * group_depth * stripe_unit); + + if (parity) { + u32 LCMdP = lcm(group_width, parity) / parity; + /* R = N % LCMdP; */ + u32 RxP = (N % LCMdP) * parity; + u32 first_dev = C - C % group_width; + + si->par_dev = (group_width + group_width - parity - RxP) % + group_width + first_dev; + si->dev = (group_width + C - RxP) % group_width + first_dev; + si->bytes_in_stripe = U; + si->first_stripe_start = M * S + G * T + N * U; + } else { + /* Make the math correct see _prepare_one_group */ + si->par_dev = group_width; + si->dev = C; + } + + si->dev *= layout->mirrors_p1; + si->par_dev *= layout->mirrors_p1; + si->offset = file_offset; + si->length = T - H; + if (si->length > length) + si->length = length; + si->M = M; +} +EXPORT_SYMBOL(ore_calc_stripe_info); + +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len) +{ + unsigned pg = *cur_pg; + struct request_queue *q = + osd_request_queue(_ios_od(ios, per_dev->dev)); + unsigned len = cur_len; + int ret; + + if (per_dev->bio == NULL) { + unsigned pages_in_stripe = ios->layout->group_width * + (ios->layout->stripe_unit / PAGE_SIZE); + unsigned nr_pages = ios->nr_pages * ios->layout->group_width / + (ios->layout->group_width - + ios->layout->parity); + unsigned bio_size = (nr_pages + pages_in_stripe) / + ios->layout->group_width; + + per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); + if (unlikely(!per_dev->bio)) { + ORE_DBGMSG("Failed to allocate BIO size=%u\n", + bio_size); + ret = -ENOMEM; + goto out; + } + } + + while (cur_len > 0) { + unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); + unsigned added_len; + + cur_len -= pglen; + + added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], + pglen, pgbase); + if (unlikely(pglen != added_len)) { + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", + per_dev->bio->bi_vcnt); + ret = -ENOMEM; + goto out; + } + _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); + + pgbase = 0; + ++pg; + } + BUG_ON(cur_len); + + per_dev->length += len; + *cur_pg = pg; + ret = 0; +out: /* we fail the complete unit on an error eg don't advance + * per_dev->length and cur_pg. This means that we might have a bigger + * bio than the CDB requested length (per_dev->length). That's fine + * only the oposite is fatal. + */ + return ret; +} + +static int _prepare_for_striping(struct ore_io_state *ios) +{ + struct ore_striping_info *si = &ios->si; + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned group_width = ios->layout->group_width; + unsigned devs_in_group = group_width * mirrors_p1; + unsigned dev = si->dev; + unsigned first_dev = dev - (dev % devs_in_group); + unsigned dev_order; + unsigned cur_pg = ios->pages_consumed; + u64 length = ios->length; + int ret = 0; + + if (!ios->pages) { + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + BUG_ON(length > si->length); + + dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); + si->cur_comp = dev_order; + si->cur_pg = si->unit_off / PAGE_SIZE; + + while (length) { + unsigned comp = dev - first_dev; + struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; + unsigned cur_len, page_off = 0; + + if (!per_dev->length) { + per_dev->dev = dev; + if (dev == si->dev) { + WARN_ON(dev == si->par_dev); + per_dev->offset = si->obj_offset; + cur_len = stripe_unit - si->unit_off; + page_off = si->unit_off & ~PAGE_MASK; + BUG_ON(page_off && (page_off != ios->pgbase)); + } else { + if (si->cur_comp > dev_order) + per_dev->offset = + si->obj_offset - si->unit_off; + else /* si->cur_comp < dev_order */ + per_dev->offset = + si->obj_offset + stripe_unit - + si->unit_off; + cur_len = stripe_unit; + } + } else { + cur_len = stripe_unit; + } + if (cur_len >= length) + cur_len = length; + + ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, + per_dev, cur_len); + if (unlikely(ret)) + goto out; + + dev += mirrors_p1; + dev = (dev % devs_in_group) + first_dev; + + length -= cur_len; + + si->cur_comp = (si->cur_comp + 1) % group_width; + if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { + if (!length && ios->sp2d) { + /* If we are writing and this is the very last + * stripe. then operate on parity dev. + */ + dev = si->par_dev; + } + if (ios->sp2d) + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + cur_len = length; + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } + + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); + if (unlikely(ret)) + goto out; + + /* Rotate next par_dev backwards with wraping */ + si->par_dev = (devs_in_group + si->par_dev - + ios->layout->parity * mirrors_p1) % + devs_in_group + first_dev; + /* Next stripe, start fresh */ + si->cur_comp = 0; + si->cur_pg = 0; + } + } +out: + ios->numdevs = devs_in_group; + ios->pages_consumed = cur_pg; + if (unlikely(ret)) { + if (length == ios->length) + return ret; + else + ios->length -= length; + } + return 0; +} + +int ore_create(struct ore_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->oc->numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_create_object(or, _ios_obj(ios, i)); + } + ret = ore_io_execute(ios); + +out: + return ret; +} +EXPORT_SYMBOL(ore_create); + +int ore_remove(struct ore_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->oc->numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_remove_object(or, _ios_obj(ios, i)); + } + ret = ore_io_execute(ios); + +out: + return ret; +} +EXPORT_SYMBOL(ore_remove); + +static int _write_mirror(struct ore_io_state *ios, int cur_comp) +{ + struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp]; + unsigned dev = ios->per_dev[cur_comp].dev; + unsigned last_comp = cur_comp + ios->layout->mirrors_p1; + int ret = 0; + + if (ios->pages && !master_dev->length) + return 0; /* Just an empty slot */ + + for (; cur_comp < last_comp; ++cur_comp, ++dev) { + struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_request *or; + + or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + per_dev->or = or; + + if (ios->pages) { + struct bio *bio; + + if (per_dev != master_dev) { + bio = bio_kmalloc(GFP_KERNEL, + master_dev->bio->bi_max_vecs); + if (unlikely(!bio)) { + ORE_DBGMSG( + "Failed to allocate BIO size=%u\n", + master_dev->bio->bi_max_vecs); + ret = -ENOMEM; + goto out; + } + + __bio_clone(bio, master_dev->bio); + bio->bi_bdev = NULL; + bio->bi_next = NULL; + per_dev->offset = master_dev->offset; + per_dev->length = master_dev->length; + per_dev->bio = bio; + per_dev->dev = dev; + } else { + bio = master_dev->bio; + /* FIXME: bio_set_dir() */ + bio->bi_rw |= REQ_WRITE; + } + + osd_req_write(or, _ios_obj(ios, dev), per_dev->offset, + bio, per_dev->length); + ORE_DBGMSG("write(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(_ios_obj(ios, dev)->id), + _LLU(per_dev->offset), + _LLU(per_dev->length), dev); + } else if (ios->kern_buff) { + per_dev->offset = ios->si.obj_offset; + per_dev->dev = ios->si.dev + dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (ios->si.unit_off + ios->length > + ios->layout->stripe_unit)); + + ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), + per_dev->offset, + ios->kern_buff, ios->length); + if (unlikely(ret)) + goto out; + ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(_ios_obj(ios, dev)->id), + _LLU(per_dev->offset), + _LLU(ios->length), per_dev->dev); + } else { + osd_req_set_attributes(or, _ios_obj(ios, dev)); + ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", + _LLU(_ios_obj(ios, dev)->id), + ios->out_attr_len, dev); + } + + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, + ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, + ios->in_attr_len); + } + +out: + return ret; +} + +int ore_write(struct ore_io_state *ios) +{ + int i; + int ret; + + if (unlikely(ios->sp2d && !ios->r4w)) { + /* A library is attempting a RAID-write without providing + * a pages lock interface. + */ + WARN_ON_ONCE(1); + return -ENOTSUPP; + } + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _write_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios); + return ret; +} +EXPORT_SYMBOL(ore_write); + +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) +{ + struct osd_request *or; + struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_obj_id *obj = _ios_obj(ios, cur_comp); + unsigned first_dev = (unsigned)obj->id; + + if (ios->pages && !per_dev->length) + return 0; /* Just an empty slot */ + + first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; + or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + + if (ios->pages) { + if (per_dev->cur_sg) { + /* finalize the last sg_entry */ + _ore_add_sg_seg(per_dev, 0, false); + if (unlikely(!per_dev->cur_sg)) + return 0; /* Skip parity only device */ + + osd_req_read_sg(or, obj, per_dev->bio, + per_dev->sglist, per_dev->cur_sg); + } else { + /* The no raid case */ + osd_req_read(or, obj, per_dev->offset, + per_dev->bio, per_dev->length); + } + + ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" + " dev=%d sg_len=%d\n", _LLU(obj->id), + _LLU(per_dev->offset), _LLU(per_dev->length), + first_dev, per_dev->cur_sg); + } else { + BUG_ON(ios->kern_buff); + + osd_req_get_attributes(or, obj); + ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", + _LLU(obj->id), + ios->in_attr_len, first_dev); + } + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); + + return 0; +} + +int ore_read(struct ore_io_state *ios) +{ + int i; + int ret; + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _ore_read_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios); + return ret; +} +EXPORT_SYMBOL(ore_read); + +int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr) +{ + struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ + void *iter = NULL; + int nelem; + + do { + nelem = 1; + osd_req_decode_get_attr_list(ios->per_dev[0].or, + &cur_attr, &nelem, &iter); + if ((cur_attr.attr_page == attr->attr_page) && + (cur_attr.attr_id == attr->attr_id)) { + attr->len = cur_attr.len; + attr->val_ptr = cur_attr.val_ptr; + return 0; + } + } while (iter); + + return -EIO; +} +EXPORT_SYMBOL(extract_attr_from_ios); + +static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, + struct osd_attr *attr) +{ + int last_comp = cur_comp + ios->layout->mirrors_p1; + + for (; cur_comp < last_comp; ++cur_comp) { + struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_request *or; + + or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + + osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); + osd_req_add_set_attr_list(or, attr, 1); + } + + return 0; +} + +struct _trunc_info { + struct ore_striping_info si; + u64 prev_group_obj_off; + u64 next_group_obj_off; + + unsigned first_group_dev; + unsigned nex_group_dev; +}; + +static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, + struct _trunc_info *ti) +{ + unsigned stripe_unit = layout->stripe_unit; + + ore_calc_stripe_info(layout, file_offset, 0, &ti->si); + + ti->prev_group_obj_off = ti->si.M * stripe_unit; + ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; + + ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); + ti->nex_group_dev = ti->first_group_dev + layout->group_width; +} + +int ore_truncate(struct ore_layout *layout, struct ore_components *oc, + u64 size) +{ + struct ore_io_state *ios; + struct exofs_trunc_attr { + struct osd_attr attr; + __be64 newsize; + } *size_attrs; + struct _trunc_info ti; + int i, ret; + + ret = ore_get_io_state(layout, oc, &ios); + if (unlikely(ret)) + return ret; + + _calc_trunk_info(ios->layout, size, &ti); + + size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), + GFP_KERNEL); + if (unlikely(!size_attrs)) { + ret = -ENOMEM; + goto out; + } + + ios->numdevs = ios->oc->numdevs; + + for (i = 0; i < ios->numdevs; ++i) { + struct exofs_trunc_attr *size_attr = &size_attrs[i]; + u64 obj_size; + + if (i < ti.first_group_dev) + obj_size = ti.prev_group_obj_off; + else if (i >= ti.nex_group_dev) + obj_size = ti.next_group_obj_off; + else if (i < ti.si.dev) /* dev within this group */ + obj_size = ti.si.obj_offset + + ios->layout->stripe_unit - ti.si.unit_off; + else if (i == ti.si.dev) + obj_size = ti.si.obj_offset; + else /* i > ti.dev */ + obj_size = ti.si.obj_offset - ti.si.unit_off; + + size_attr->newsize = cpu_to_be64(obj_size); + size_attr->attr = g_attr_logical_length; + size_attr->attr.val_ptr = &size_attr->newsize; + + ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", + _LLU(oc->comps->obj.id), _LLU(obj_size), i); + ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, + &size_attr->attr); + if (unlikely(ret)) + goto out; + } + ret = ore_io_execute(ios); + +out: + kfree(size_attrs); + ore_put_io_state(ios); + return ret; +} +EXPORT_SYMBOL(ore_truncate); + +const struct osd_attr g_attr_logical_length = ATTR_DEF( + OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); +EXPORT_SYMBOL(g_attr_logical_length); diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c new file mode 100644 index 000000000000..29c47e5c4a86 --- /dev/null +++ b/fs/exofs/ore_raid.c @@ -0,0 +1,660 @@ +/* + * Copyright (C) 2011 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation <info@fsf.org>" + */ + +#include <linux/gfp.h> +#include <linux/async_tx.h> + +#include "ore_raid.h" + +#undef ORE_DBGMSG2 +#define ORE_DBGMSG2 ORE_DBGMSG + +struct page *_raid_page_alloc(void) +{ + return alloc_page(GFP_KERNEL); +} + +void _raid_page_free(struct page *p) +{ + __free_page(p); +} + +/* This struct is forward declare in ore_io_state, but is private to here. + * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. + * + * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. + * Ascending page index access is sp2d(p-minor, c-major). But storage is + * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor + * API. + */ +struct __stripe_pages_2d { + /* Cache some hot path repeated calculations */ + unsigned parity; + unsigned data_devs; + unsigned pages_in_unit; + + bool needed ; + + /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ + struct __1_page_stripe { + bool alloc; + unsigned write_count; + struct async_submit_ctl submit; + struct dma_async_tx_descriptor *tx; + + /* The size of this array is data_devs + parity */ + struct page **pages; + struct page **scribble; + /* bool array, size of this array is data_devs */ + char *page_is_read; + } _1p_stripes[]; +}; + +/* This can get bigger then a page. So support multiple page allocations + * _sp2d_free should be called even if _sp2d_alloc fails (by returning + * none-zero). + */ +static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, + unsigned parity, struct __stripe_pages_2d **psp2d) +{ + struct __stripe_pages_2d *sp2d; + unsigned data_devs = group_width - parity; + struct _alloc_all_bytes { + struct __alloc_stripe_pages_2d { + struct __stripe_pages_2d sp2d; + struct __1_page_stripe _1p_stripes[pages_in_unit]; + } __asp2d; + struct __alloc_1p_arrays { + struct page *pages[group_width]; + struct page *scribble[group_width]; + char page_is_read[data_devs]; + } __a1pa[pages_in_unit]; + } *_aab; + struct __alloc_1p_arrays *__a1pa; + struct __alloc_1p_arrays *__a1pa_end; + const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); + unsigned num_a1pa, alloc_size, i; + + /* FIXME: check these numbers in ore_verify_layout */ + BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); + BUG_ON(sizeof__a1pa > PAGE_SIZE); + + if (sizeof(*_aab) > PAGE_SIZE) { + num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; + alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; + } else { + num_a1pa = pages_in_unit; + alloc_size = sizeof(*_aab); + } + + _aab = kzalloc(alloc_size, GFP_KERNEL); + if (unlikely(!_aab)) { + ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); + return -ENOMEM; + } + + sp2d = &_aab->__asp2d.sp2d; + *psp2d = sp2d; /* From here Just call _sp2d_free */ + + __a1pa = _aab->__a1pa; + __a1pa_end = __a1pa + num_a1pa; + + for (i = 0; i < pages_in_unit; ++i) { + if (unlikely(__a1pa >= __a1pa_end)) { + num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, + pages_in_unit - i); + + __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); + if (unlikely(!__a1pa)) { + ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", + num_a1pa); + return -ENOMEM; + } + __a1pa_end = __a1pa + num_a1pa; + /* First *pages is marked for kfree of the buffer */ + sp2d->_1p_stripes[i].alloc = true; + } + + sp2d->_1p_stripes[i].pages = __a1pa->pages; + sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; + sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; + ++__a1pa; + } + + sp2d->parity = parity; + sp2d->data_devs = data_devs; + sp2d->pages_in_unit = pages_in_unit; + return 0; +} + +static void _sp2d_reset(struct __stripe_pages_2d *sp2d, + const struct _ore_r4w_op *r4w, void *priv) +{ + unsigned data_devs = sp2d->data_devs; + unsigned group_width = data_devs + sp2d->parity; + unsigned p; + + if (!sp2d->needed) + return; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count < group_width) { + unsigned c; + + for (c = 0; c < data_devs; c++) + if (_1ps->page_is_read[c]) { + struct page *page = _1ps->pages[c]; + + r4w->put_page(priv, page); + _1ps->page_is_read[c] = false; + } + } + + memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); + _1ps->write_count = 0; + _1ps->tx = NULL; + } + + sp2d->needed = false; +} + +static void _sp2d_free(struct __stripe_pages_2d *sp2d) +{ + unsigned i; + + if (!sp2d) + return; + + for (i = 0; i < sp2d->pages_in_unit; ++i) { + if (sp2d->_1p_stripes[i].alloc) + kfree(sp2d->_1p_stripes[i].pages); + } + + kfree(sp2d); +} + +static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (!_1ps->write_count) + continue; + + init_async_submit(&_1ps->submit, + ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, + NULL, + NULL, NULL, + (addr_conv_t *)_1ps->scribble); + + /* TODO: raid6 */ + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, + 0, sp2d->data_devs, PAGE_SIZE, + &_1ps->submit); + } + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + /* NOTE: We wait for HW synchronously (I don't have such HW + * to test with.) Is parallelism needed with today's multi + * cores? + */ + async_tx_issue_pending(_1ps->tx); + } +} + +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + struct __1_page_stripe *_1ps; + + sp2d->needed = true; + + _1ps = &sp2d->_1p_stripes[si->cur_pg]; + _1ps->pages[si->cur_comp] = page; + ++_1ps->write_count; + + si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; + /* si->cur_comp is advanced outside at main loop */ +} + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last) +{ + struct osd_sg_entry *sge; + + ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " + "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", + per_dev->dev, cur_len, not_last, per_dev->cur_sg, + _LLU(per_dev->offset), per_dev->length, + per_dev->last_sgs_total); + + if (!per_dev->cur_sg) { + sge = per_dev->sglist; + + /* First time we prepare two entries */ + if (per_dev->length) { + ++per_dev->cur_sg; + sge->offset = per_dev->offset; + sge->len = per_dev->length; + } else { + /* Here the parity is the first unit of this object. + * This happens every time we reach a parity device on + * the same stripe as the per_dev->offset. We need to + * just skip this unit. + */ + per_dev->offset += cur_len; + return; + } + } else { + /* finalize the last one */ + sge = &per_dev->sglist[per_dev->cur_sg - 1]; + sge->len = per_dev->length - per_dev->last_sgs_total; + } + + if (not_last) { + /* Partly prepare the next one */ + struct osd_sg_entry *next_sge = sge + 1; + + ++per_dev->cur_sg; + next_sge->offset = sge->offset + sge->len + cur_len; + /* Save cur len so we know how mutch was added next time */ + per_dev->last_sgs_total = per_dev->length; + next_sge->len = 0; + } else if (!sge->len) { + /* Optimize for when the last unit is a parity */ + --per_dev->cur_sg; + } +} + +static int _alloc_read_4_write(struct ore_io_state *ios) +{ + struct ore_layout *layout = ios->layout; + int ret; + /* We want to only read those pages not in cache so worst case + * is a stripe populated with every other page + */ + unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; + + ret = _ore_get_io_state(layout, ios->oc, + layout->group_width * layout->mirrors_p1, + sgs_per_dev, 0, &ios->ios_read_4_write); + return ret; +} + +/* @si contains info of the to-be-inserted page. Update of @si should be + * maintained by caller. Specificaly si->dev, si->obj_offset, ... + */ +static int _add_to_read_4_write(struct ore_io_state *ios, + struct ore_striping_info *si, struct page *page) +{ + struct request_queue *q; + struct ore_per_dev_state *per_dev; + struct ore_io_state *read_ios; + unsigned first_dev = si->dev - (si->dev % + (ios->layout->group_width * ios->layout->mirrors_p1)); + unsigned comp = si->dev - first_dev; + unsigned added_len; + + if (!ios->ios_read_4_write) { + int ret = _alloc_read_4_write(ios); + + if (unlikely(ret)) + return ret; + } + + read_ios = ios->ios_read_4_write; + read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; + + per_dev = &read_ios->per_dev[comp]; + if (!per_dev->length) { + per_dev->bio = bio_kmalloc(GFP_KERNEL, + ios->sp2d->pages_in_unit); + if (unlikely(!per_dev->bio)) { + ORE_DBGMSG("Failed to allocate BIO size=%u\n", + ios->sp2d->pages_in_unit); + return -ENOMEM; + } + per_dev->offset = si->obj_offset; + per_dev->dev = si->dev; + } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { + u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); + + _ore_add_sg_seg(per_dev, gap, true); + } + q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); + added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); + if (unlikely(added_len != PAGE_SIZE)) { + ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", + per_dev->bio->bi_vcnt); + return -ENOMEM; + } + + per_dev->length += PAGE_SIZE; + return 0; +} + +static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) +{ + struct bio_vec *bv; + unsigned i, d; + + /* loop on all devices all pages */ + for (d = 0; d < ios->numdevs; d++) { + struct bio *bio = ios->per_dev[d].bio; + + if (!bio) + continue; + + __bio_for_each_segment(bv, bio, i, 0) { + struct page *page = bv->bv_page; + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + } + } +} + +/* read_4_write is hacked to read the start of the first stripe and/or + * the end of the last stripe. If needed, with an sg-gap at each device/page. + * It is assumed to be called after the to_be_written pages of the first stripe + * are populating ios->sp2d[][] + * + * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations + * These pages are held at sp2d[p].pages[c] but with + * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are + * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is + * @uptodate=true, so we don't need to read it, only unlock, after IO. + * + * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then + * to-be-written count, we should consider the xor-in-place mode. + * need_to_read_pages_count is the actual number of pages not present in cache. + * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough + * approximation? In this mode the read pages are put in the empty places of + * ios->sp2d[p][*], xor is calculated the same way. These pages are + * allocated/freed and don't go through cache + */ +static int _read_4_write(struct ore_io_state *ios) +{ + struct ore_io_state *ios_read; + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset = ios->si.first_stripe_start; + u64 last_stripe_end; + unsigned bytes_in_stripe = ios->si.bytes_in_stripe; + unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; + int ret; + + if (offset == ios->offset) /* Go to start collect $200 */ + goto read_last_stripe; + + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + + for (c = 0; ; c++) { + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + read_si.obj_offset += min_p * PAGE_SIZE; + offset += min_p * PAGE_SIZE; + for (p = min_p; p <= max_p; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + struct page **pp = &_1ps->pages[c]; + bool uptodate; + + if (*pp) + /* to-be-written pages start here */ + goto read_last_stripe; + + *pp = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!*pp)) + return -ENOMEM; + + if (!uptodate) + _add_to_read_4_write(ios, &read_si, *pp); + + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + read_si.obj_offset += PAGE_SIZE; + offset += PAGE_SIZE; + } + offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; + } + +read_last_stripe: + offset = ios->offset + (ios->length + PAGE_SIZE - 1) / + PAGE_SIZE * PAGE_SIZE; + last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) + * bytes_in_stripe; + if (offset == last_stripe_end) /* Optimize for the aligned case */ + goto read_it; + + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + p = read_si.unit_off / PAGE_SIZE; + c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, + ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); + + BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); + /* unaligned IO must be within a single stripe */ + + if (min_p == sp2d->pages_in_unit) { + /* Didn't do it yet */ + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + } + + while (offset < last_stripe_end) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if ((min_p <= p) && (p <= max_p)) { + struct page *page; + bool uptodate; + + BUG_ON(_1ps->pages[c]); + page = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!page)) + return -ENOMEM; + + _1ps->pages[c] = page; + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + if (!uptodate) + _add_to_read_4_write(ios, &read_si, page); + } + + offset += PAGE_SIZE; + if (p == (sp2d->pages_in_unit - 1)) { + ++c; + p = 0; + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + } else { + read_si.obj_offset += PAGE_SIZE; + ++p; + } + } + +read_it: + ios_read = ios->ios_read_4_write; + if (!ios_read) + return 0; + + /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change + * to check for per_dev->bio + */ + ios_read->pages = ios->pages; + + /* Now read these devices */ + for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { + ret = _ore_read_mirror(ios_read, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios_read); /* Synchronus execution */ + if (unlikely(ret)) { + ORE_DBGMSG("!! ore_io_execute => %d\n", ret); + return ret; + } + + _mark_read4write_pages_uptodate(ios_read, ret); + return 0; +} + +/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ +int _ore_add_parity_unit(struct ore_io_state *ios, + struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, + unsigned cur_len) +{ + if (ios->reading) { + BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); + _ore_add_sg_seg(per_dev, cur_len, true); + } else { + struct __stripe_pages_2d *sp2d = ios->sp2d; + struct page **pages = ios->parity_pages + ios->cur_par_page; + unsigned num_pages; + unsigned array_start = 0; + unsigned i; + int ret; + + si->cur_pg = _sp2d_min_pg(sp2d); + num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; + + if (!cur_len) /* If last stripe operate on parity comp */ + si->cur_comp = sp2d->data_devs; + + if (!per_dev->length) { + per_dev->offset += si->cur_pg * PAGE_SIZE; + /* If first stripe, Read in all read4write pages + * (if needed) before we calculate the first parity. + */ + _read_4_write(ios); + } + + for (i = 0; i < num_pages; i++) { + pages[i] = _raid_page_alloc(); + if (unlikely(!pages[i])) + return -ENOMEM; + + ++(ios->cur_par_page); + } + + BUG_ON(si->cur_comp != sp2d->data_devs); + BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); + + ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, + per_dev, num_pages * PAGE_SIZE); + if (unlikely(ret)) + return ret; + + /* TODO: raid6 if (last_parity_dev) */ + _gen_xor_unit(sp2d); + _sp2d_reset(sp2d, ios->r4w, ios->private); + } + return 0; +} + +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) +{ + struct ore_layout *layout = ios->layout; + + if (ios->parity_pages) { + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + unsigned stripe_size = ios->si.bytes_in_stripe; + u64 last_stripe, first_stripe; + + if (_sp2d_alloc(pages_in_unit, layout->group_width, + layout->parity, &ios->sp2d)) { + return -ENOMEM; + } + + BUG_ON(ios->offset % PAGE_SIZE); + + /* Round io down to last full strip */ + first_stripe = div_u64(ios->offset, stripe_size); + last_stripe = div_u64(ios->offset + ios->length, stripe_size); + + /* If an IO spans more then a single stripe it must end at + * a stripe boundary. The reminder at the end is pushed into the + * next IO. + */ + if (last_stripe != first_stripe) { + ios->length = last_stripe * stripe_size - ios->offset; + + BUG_ON(!ios->length); + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / + PAGE_SIZE; + ios->si.length = ios->length; /*make it consistent */ + } + } + return 0; +} + +void _ore_free_raid_stuff(struct ore_io_state *ios) +{ + if (ios->sp2d) { /* writing and raid */ + unsigned i; + + for (i = 0; i < ios->cur_par_page; i++) { + struct page *page = ios->parity_pages[i]; + + if (page) + _raid_page_free(page); + } + if (ios->extra_part_alloc) + kfree(ios->parity_pages); + /* If IO returned an error pages might need unlocking */ + _sp2d_reset(ios->sp2d, ios->r4w, ios->private); + _sp2d_free(ios->sp2d); + } else { + /* Will only be set if raid reading && sglist is big */ + if (ios->extra_part_alloc) + kfree(ios->per_dev[0].sglist); + } + if (ios->ios_read_4_write) + ore_put_io_state(ios->ios_read_4_write); +} diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h new file mode 100644 index 000000000000..2ffd2c3c6e46 --- /dev/null +++ b/fs/exofs/ore_raid.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) from 2011 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation <info@fsf.org>" + */ + +#include <scsi/osd_ore.h> + +#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define ORE_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define ORE_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +#define ORE_DBGMSG2(M...) do {} while (0) +/* #define ORE_DBGMSG2 ORE_DBGMSG */ + +/* Calculate the component order in a stripe. eg the logical data unit + * address within the stripe of @dev given the @par_dev of this stripe. + */ +static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, + unsigned par_dev, unsigned dev) +{ + unsigned first_dev = dev - dev % devs_in_group; + + dev -= first_dev; + par_dev -= first_dev; + + if (devs_in_group == par_dev) /* The raid 0 case */ + return dev / mirrors_p1; + /* raid4/5/6 case */ + return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / + mirrors_p1; +} + +/* ios_raid.c stuff needed by ios.c */ +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); +void _ore_free_raid_stuff(struct ore_io_state *ios); + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last); +int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, unsigned cur_len); +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page); +static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + if (!sp2d) /* Inline the fast path */ + return; /* Hay no raid stuff */ + _ore_add_stripe_page(sp2d, si, page); +} + +/* ios.c stuff needed by ios_raid.c */ +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios); +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len); +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); +int ore_io_execute(struct ore_io_state *ios); diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h deleted file mode 100644 index c52e9888b8ab..000000000000 --- a/fs/exofs/pnfs.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - */ - -/* FIXME: Remove this file once pnfs hits mainline */ - -#ifndef __EXOFS_PNFS_H__ -#define __EXOFS_PNFS_H__ - -#if ! defined(__PNFS_OSD_XDR_H__) - -enum pnfs_iomode { - IOMODE_READ = 1, - IOMODE_RW = 2, - IOMODE_ANY = 3, -}; - -/* Layout Structure */ -enum pnfs_osd_raid_algorithm4 { - PNFS_OSD_RAID_0 = 1, - PNFS_OSD_RAID_4 = 2, - PNFS_OSD_RAID_5 = 3, - PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ -}; - -struct pnfs_osd_data_map { - u32 odm_num_comps; - u64 odm_stripe_unit; - u32 odm_group_width; - u32 odm_group_depth; - u32 odm_mirror_cnt; - u32 odm_raid_algorithm; -}; - -#endif /* ! defined(__PNFS_OSD_XDR_H__) */ - -#endif /* __EXOFS_PNFS_H__ */ diff --git a/fs/exofs/super.c b/fs/exofs/super.c index c57beddcc217..e6085ec192d6 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -35,11 +35,14 @@ #include <linux/parser.h> #include <linux/vfs.h> #include <linux/random.h> +#include <linux/module.h> #include <linux/exportfs.h> #include <linux/slab.h> #include "exofs.h" +#define EXOFS_DBGMSG2(M...) do {} while (0) + /****************************************************************************** * MOUNT OPTIONS *****************************************************************************/ @@ -208,10 +211,48 @@ static void destroy_inodecache(void) } /****************************************************************************** - * SUPERBLOCK FUNCTIONS + * Some osd helpers *****************************************************************************/ -static const struct super_operations exofs_sops; -static const struct export_operations exofs_export_ops; +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) +{ + osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); +} + +static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, + u64 offset, void *p, unsigned length) +{ + struct osd_request *or = osd_start_request(od, GFP_KERNEL); +/* struct osd_sense_info osi = {.key = 0};*/ + int ret; + + if (unlikely(!or)) { + EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); + return -ENOMEM; + } + ret = osd_req_read_kern(or, obj, offset, p, length); + if (unlikely(ret)) { + EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); + goto out; + } + + ret = osd_finalize_request(or, 0, cred, NULL); + if (unlikely(ret)) { + EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); + goto out; + } + + ret = osd_execute_request(or); + if (unlikely(ret)) + EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); + /* osd_req_decode_sense(or, ret); */ + +out: + osd_end_request(or); + EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%p ret=>%d\n", + _LLU(obj->id), _LLU(offset), _LLU(length), od, ret); + return ret; +} static const struct osd_attr g_attr_sb_stats = ATTR_DEF( EXOFS_APAGE_SB_DATA, @@ -223,21 +264,19 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) struct osd_attr attrs[] = { [0] = g_attr_sb_stats, }; - struct exofs_io_state *ios; + struct ore_io_state *ios; int ret; - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; } - ios->cred = sbi->s_cred; - ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); - ret = exofs_sbi_read(ios); + ret = ore_read(ios); if (unlikely(ret)) { EXOFS_ERR("Error reading super_block stats => %d\n", ret); goto out; @@ -264,13 +303,13 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) } out: - exofs_put_io_state(ios); + ore_put_io_state(ios); return ret; } -static void stats_done(struct exofs_io_state *ios, void *p) +static void stats_done(struct ore_io_state *ios, void *p) { - exofs_put_io_state(ios); + ore_put_io_state(ios); /* Good thanks nothing to do anymore */ } @@ -280,12 +319,12 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) struct osd_attr attrs[] = { [0] = g_attr_sb_stats, }; - struct exofs_io_state *ios; + struct ore_io_state *ios; int ret; - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; } @@ -293,29 +332,37 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); attrs[0].val_ptr = &sbi->s_ess; - ios->cred = sbi->s_cred; + ios->done = stats_done; ios->private = sbi; ios->out_attr = attrs; ios->out_attr_len = ARRAY_SIZE(attrs); - ret = exofs_sbi_write(ios); + ret = ore_write(ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); - exofs_put_io_state(ios); + EXOFS_ERR("%s: ore_write failed.\n", __func__); + ore_put_io_state(ios); } return ret; } +/****************************************************************************** + * SUPERBLOCK FUNCTIONS + *****************************************************************************/ +static const struct super_operations exofs_sops; +static const struct export_operations exofs_export_ops; + /* * Write the superblock to the OSD */ -int exofs_sync_fs(struct super_block *sb, int wait) +static int exofs_sync_fs(struct super_block *sb, int wait) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; - struct exofs_io_state *ios; + struct ore_comp one_comp; + struct ore_components oc; + struct ore_io_state *ios; int ret = -ENOMEM; fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); @@ -331,7 +378,10 @@ int exofs_sync_fs(struct super_block *sb, int wait) * version). Otherwise the exofs_fscb is read-only from mkfs time. All * the writeable info is set in exofs_sbi_write_stats() above. */ - ret = exofs_get_io_state(&sbi->layout, &ios); + + exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); + + ret = ore_get_io_state(&sbi->layout, &oc, &ios); if (unlikely(ret)) goto out; @@ -345,14 +395,12 @@ int exofs_sync_fs(struct super_block *sb, int wait) fscb->s_newfs = 0; fscb->s_version = EXOFS_FSCB_VER; - ios->obj.id = EXOFS_SUPER_ID; ios->offset = 0; ios->kern_buff = fscb; - ios->cred = sbi->s_cred; - ret = exofs_sbi_write(ios); + ret = ore_write(ios); if (unlikely(ret)) - EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); + EXOFS_ERR("%s: ore_write failed.\n", __func__); else sb->s_dirt = 0; @@ -360,7 +408,7 @@ int exofs_sync_fs(struct super_block *sb, int wait) unlock_super(sb); out: EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); - exofs_put_io_state(ios); + ore_put_io_state(ios); kfree(fscb); return ret; } @@ -382,17 +430,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path, msg, dev_path ?: "", odi->osdname, _LLU(pid)); } -void exofs_free_sbi(struct exofs_sb_info *sbi) +static void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->layout.s_numdevs) { - int i = --sbi->layout.s_numdevs; - struct osd_dev *od = sbi->layout.s_ods[i]; + unsigned numdevs = sbi->oc.numdevs; + + while (numdevs) { + unsigned i = --numdevs; + struct osd_dev *od = ore_comp_dev(&sbi->oc, i); if (od) { - sbi->layout.s_ods[i] = NULL; + ore_comp_set_dev(&sbi->oc, i, NULL); osduld_put_device(od); } } + kfree(sbi->oc.ods); kfree(sbi); } @@ -419,8 +470,8 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], - sbi->layout.s_pid); + _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), + sbi->one_comp.obj.partition); bdi_destroy(&sbi->bdi); exofs_free_sbi(sbi); @@ -430,81 +481,34 @@ static void exofs_put_super(struct super_block *sb) static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_device_table *dt) { - u64 stripe_length; + int ret; - sbi->data_map.odm_num_comps = - le32_to_cpu(dt->dt_data_map.cb_num_comps); - sbi->data_map.odm_stripe_unit = + sbi->layout.stripe_unit = le64_to_cpu(dt->dt_data_map.cb_stripe_unit); - sbi->data_map.odm_group_width = + sbi->layout.group_width = le32_to_cpu(dt->dt_data_map.cb_group_width); - sbi->data_map.odm_group_depth = + sbi->layout.group_depth = le32_to_cpu(dt->dt_data_map.cb_group_depth); - sbi->data_map.odm_mirror_cnt = - le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); - sbi->data_map.odm_raid_algorithm = + sbi->layout.mirrors_p1 = + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; + sbi->layout.raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Only raid0 for now. if not so, do not mount */ - if (sbi->data_map.odm_num_comps != numdevs) { - EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", - sbi->data_map.odm_num_comps, numdevs); - return -EINVAL; - } - if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { - EXOFS_ERR("Only RAID_0 for now\n"); - return -EINVAL; - } - if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { - EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", - numdevs, sbi->data_map.odm_mirror_cnt); - return -EINVAL; - } - - if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { - EXOFS_ERR("Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); - return -EINVAL; - } - - sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; - sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; - - if (sbi->data_map.odm_group_width) { - sbi->layout.group_width = sbi->data_map.odm_group_width; - sbi->layout.group_depth = sbi->data_map.odm_group_depth; - if (!sbi->layout.group_depth) { - EXOFS_ERR("group_depth == 0 && group_width != 0\n"); - return -EINVAL; - } - sbi->layout.group_count = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1 / - sbi->data_map.odm_group_width; - } else { - if (sbi->data_map.odm_group_depth) { - printk(KERN_NOTICE "Warning: group_depth ignored " - "group_width == 0 && group_depth == %d\n", - sbi->data_map.odm_group_depth); - sbi->data_map.odm_group_depth = 0; - } - sbi->layout.group_width = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1; - sbi->layout.group_depth = -1; - sbi->layout.group_count = 1; - } - - stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; - if (stripe_length >= (1ULL << 32)) { - EXOFS_ERR("Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -EINVAL; - } - - return 0; + ret = ore_verify_layout(numdevs, &sbi->layout); + + EXOFS_DBGMSG("exofs: layout: " + "num_comps=%u stripe_unit=0x%x group_width=%u " + "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n", + numdevs, + sbi->layout.stripe_unit, + sbi->layout.group_width, + _LLU(sbi->layout.group_depth), + sbi->layout.mirrors_p1, + sbi->layout.raid_algorithm); + return ret; } -static unsigned __ra_pages(struct exofs_layout *layout) +static unsigned __ra_pages(struct ore_layout *layout) { const unsigned _MIN_RA = 32; /* min 128K read-ahead */ unsigned ra_pages = layout->group_width * layout->stripe_unit / @@ -547,14 +551,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, return !(odi->systemid_len || odi->osdname_len); } -static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, +int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_dev **peds) +{ + struct __alloc_ore_devs_and_exofs_devs { + /* Twice bigger table: See exofs_init_comps() and comment at + * exofs_read_lookup_dev_table() + */ + struct ore_dev *oreds[numdevs * 2 - 1]; + struct exofs_dev eds[numdevs]; + } *aoded; + struct exofs_dev *eds; + unsigned i; + + aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); + if (unlikely(!aoded)) { + EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", + numdevs); + return -ENOMEM; + } + + sbi->oc.ods = aoded->oreds; + *peds = eds = aoded->eds; + for (i = 0; i < numdevs; ++i) + aoded->oreds[i] = &eds[i].ored; + return 0; +} + +static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, + struct osd_dev *fscb_od, unsigned table_count) { - struct exofs_sb_info *sbi = *psbi; - struct osd_dev *fscb_od; - struct osd_obj_id obj = {.partition = sbi->layout.s_pid, - .id = EXOFS_DEVTABLE_ID}; + struct ore_comp comp; struct exofs_device_table *dt; + struct exofs_dev *eds; unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + sizeof(*dt); unsigned numdevs, i; @@ -567,10 +597,14 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, return -ENOMEM; } - fscb_od = sbi->layout.s_ods[0]; - sbi->layout.s_ods[0] = NULL; - sbi->layout.s_numdevs = 0; - ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); + sbi->oc.numdevs = 0; + + comp.obj.partition = sbi->one_comp.obj.partition; + comp.obj.id = EXOFS_DEVTABLE_ID; + exofs_make_credential(comp.cred, &comp.obj); + + ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt, + table_bytes); if (unlikely(ret)) { EXOFS_ERR("ERROR: reading device table\n"); goto out; @@ -587,18 +621,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, if (unlikely(ret)) goto out; - if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]); - - sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); - if (unlikely(!sbi)) { - ret = -ENOMEM; - goto out; - } - memset(&sbi->layout.s_ods[1], 0, - size - sizeof(sbi->layout.s_ods[0])); - *psbi = sbi; - } + ret = __alloc_dev_table(sbi, numdevs, &eds); + if (unlikely(ret)) + goto out; + /* exofs round-robins the device table view according to inode + * number. We hold a: twice bigger table hence inodes can point + * to any device and have a sequential view of the table + * starting at this device. See exofs_init_comps() + */ + memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], + (numdevs - 1) * sizeof(sbi->oc.ods[0])); for (i = 0; i < numdevs; i++) { struct exofs_fscb fscb; @@ -614,13 +646,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", i, odi.osdname); + /* the exofs id is currently the table index */ + eds[i].did = i; + /* On all devices the device table is identical. The user can * specify any one of the participating devices on the command * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->layout.s_ods[i] = fscb_od; - ++sbi->layout.s_numdevs; + eds[i].ored.od = fscb_od; + ++sbi->oc.numdevs; fscb_od = NULL; continue; } @@ -633,13 +668,13 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, goto out; } - sbi->layout.s_ods[i] = od; - ++sbi->layout.s_numdevs; + eds[i].ored.od = od; + ++sbi->oc.numdevs; /* Read the fscb of the other devices to make sure the FS * partition is there. */ - ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, + ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb)); if (unlikely(ret)) { EXOFS_ERR("ERROR: Malformed participating device " @@ -656,13 +691,11 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, out: kfree(dt); - if (unlikely(!ret && fscb_od)) { - EXOFS_ERR( - "ERROR: Bad device-table container device not present\n"); - osduld_put_device(fscb_od); - ret = -EINVAL; + if (unlikely(fscb_od && !ret)) { + EXOFS_ERR("ERROR: Bad device-table container device not present\n"); + osduld_put_device(fscb_od); + return -EINVAL; } - return ret; } @@ -676,7 +709,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) struct exofs_sb_info *sbi; /*extended info */ struct osd_dev *od; /* Master device */ struct exofs_fscb fscb; /*on-disk superblock info */ - struct osd_obj_id obj; + struct ore_comp comp; unsigned table_count; int ret; @@ -684,10 +717,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; - ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); - if (ret) - goto free_bdi; - /* use mount options to fill superblock */ if (opts->is_osdname) { struct osd_dev_info odi = {.systemid_len = 0}; @@ -695,6 +724,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) odi.osdname_len = strlen(opts->dev_name); odi.osdname = (u8 *)opts->dev_name; od = osduld_info_lookup(&odi); + kfree(opts->dev_name); + opts->dev_name = NULL; } else { od = osduld_path_lookup(opts->dev_name); } @@ -709,11 +740,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->layout.group_width = 1; sbi->layout.group_depth = -1; sbi->layout.group_count = 1; - sbi->layout.s_ods[0] = od; - sbi->layout.s_numdevs = 1; - sbi->layout.s_pid = opts->pid; sbi->s_timeout = opts->timeout; + sbi->one_comp.obj.partition = opts->pid; + sbi->one_comp.obj.id = 0; + exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); + sbi->oc.numdevs = 1; + sbi->oc.single_comp = EC_SINGLE_COMP; + sbi->oc.comps = &sbi->one_comp; + /* fill in some other data by hand */ memset(sb->s_id, 0, sizeof(sb->s_id)); strcpy(sb->s_id, "exofs"); @@ -724,11 +759,11 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sb->s_bdev = NULL; sb->s_dev = 0; - obj.partition = sbi->layout.s_pid; - obj.id = EXOFS_SUPER_ID; - exofs_make_credential(sbi->s_cred, &obj); + comp.obj.partition = sbi->one_comp.obj.partition; + comp.obj.id = EXOFS_SUPER_ID; + exofs_make_credential(comp.cred, &comp.obj); - ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb)); + ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb)); if (unlikely(ret)) goto free_sbi; @@ -757,9 +792,17 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) table_count = le64_to_cpu(fscb.s_dev_table_count); if (table_count) { - ret = exofs_read_lookup_dev_table(&sbi, table_count); + ret = exofs_read_lookup_dev_table(sbi, od, table_count); + if (unlikely(ret)) + goto free_sbi; + } else { + struct exofs_dev *eds; + + ret = __alloc_dev_table(sbi, 1, &eds); if (unlikely(ret)) goto free_sbi; + + ore_comp_set_dev(&sbi->oc, 0, od); } __sbi_read_stats(sbi); @@ -793,20 +836,21 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], - sbi->layout.s_pid); - if (opts->is_osdname) - kfree(opts->dev_name); + ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); + if (ret) { + EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); + goto free_sbi; + } + + _exofs_print_device("Mounting", opts->dev_name, + ore_comp_dev(&sbi->oc, 0), + sbi->one_comp.obj.partition); return 0; free_sbi: - bdi_destroy(&sbi->bdi); -free_bdi: EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", - opts->dev_name, sbi->layout.s_pid, ret); + opts->dev_name, sbi->one_comp.obj.partition, ret); exofs_free_sbi(sbi); - if (opts->is_osdname) - kfree(opts->dev_name); return ret; } @@ -837,7 +881,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct exofs_io_state *ios; + struct ore_io_state *ios; struct osd_attr attrs[] = { ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), @@ -846,21 +890,18 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) }; uint64_t capacity = ULLONG_MAX; uint64_t used = ULLONG_MAX; - uint8_t cred_a[OSD_CAP_LEN]; int ret; - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (ret) { - EXOFS_DBGMSG("exofs_get_io_state failed.\n"); + EXOFS_DBGMSG("ore_get_io_state failed.\n"); return ret; } - exofs_make_credential(cred_a, &ios->obj); - ios->cred = sbi->s_cred; ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); - ret = exofs_sbi_read(ios); + ret = ore_read(ios); if (unlikely(ret)) goto out; @@ -889,7 +930,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = EXOFS_NAME_LEN; out: - exofs_put_io_state(ios); + ore_put_io_state(ios); return ret; } @@ -908,7 +949,7 @@ static const struct super_operations exofs_sops = { * EXPORT OPERATIONS *****************************************************************************/ -struct dentry *exofs_get_parent(struct dentry *child) +static struct dentry *exofs_get_parent(struct dentry *child) { unsigned long ino = exofs_parent_ino(child); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index bfe651f9ae16..35d6a3cfd9ff 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -128,7 +128,7 @@ fail: /* * inode->i_mutex: don't care */ -static struct posix_acl * +struct posix_acl * ext2_get_acl(struct inode *inode, int type) { int name_index; @@ -194,12 +194,10 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); if (error == 0) @@ -231,29 +229,6 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -int -ext2_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - return -ECHILD; - return -EAGAIN; - } - - acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; -} - /* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * @@ -276,29 +251,18 @@ ext2_init_acl(struct inode *inode, struct inode *dir) inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - struct posix_acl *clone; - mode_t mode; - if (S_ISDIR(inode->i_mode)) { error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); if (error) goto cleanup; } - clone = posix_acl_clone(acl, GFP_KERNEL); - error = -ENOMEM; - if (!clone) - goto cleanup; - mode = inode->i_mode; - error = posix_acl_create_masq(clone, &mode); - if (error >= 0) { - inode->i_mode = mode; - if (error > 0) { - /* This is an extended ACL */ - error = ext2_set_acl(inode, - ACL_TYPE_ACCESS, clone); - } + error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); + if (error < 0) + return error; + if (error > 0) { + /* This is an extended ACL */ + error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); } - posix_acl_release(clone); } cleanup: posix_acl_release(acl); @@ -322,7 +286,7 @@ cleanup: int ext2_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; int error; if (!test_opt(inode->i_sb, POSIX_ACL)) @@ -332,14 +296,11 @@ ext2_acl_chmod(struct inode *inode) acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); + error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (error) + return error; + error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) - error = ext2_set_acl(inode, ACL_TYPE_ACCESS, clone); - posix_acl_release(clone); return error; } diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 3ff6cbb9ac44..503bfb0ed79b 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -54,13 +54,12 @@ static inline int ext2_acl_count(size_t size) #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern int ext2_check_acl (struct inode *, int); +extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); #else #include <linux/sched.h> -#define ext2_check_acl NULL #define ext2_get_acl NULL #define ext2_set_acl NULL diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 8f44cef1b3ef..a8cbe1bc6ad4 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -421,7 +421,7 @@ static inline int rsv_is_empty(struct ext2_reserve_window *rsv) void ext2_init_block_alloc_info(struct inode *inode) { struct ext2_inode_info *ei = EXT2_I(inode); - struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext2_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index af9fc89b1b2d..9a4e5e206d08 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -135,10 +135,10 @@ extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); struct dentry *ext2_get_parent(struct dentry *child); /* super.c */ -extern void ext2_error (struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void ext2_msg(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ext2_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext2_msg(struct super_block *, const char *, const char *, ...); extern void ext2_update_dynamic_rev (struct super_block *sb); extern void ext2_write_super (struct super_block *); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 82e06321de35..a5b3a5db3120 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -102,6 +102,6 @@ const struct inode_operations ext2_file_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .check_acl = ext2_check_acl, + .get_acl = ext2_get_acl, .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index ee9ed31948e1..c4e81dfb74ba 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -601,7 +601,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); return ERR_PTR(err); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index a8a58f63f07c..91a6945af6d8 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1321,7 +1321,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index d60b7099e2db..761fde807fc9 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -408,7 +408,7 @@ const struct inode_operations ext2_dir_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .check_acl = ext2_check_acl, + .get_acl = ext2_get_acl, }; const struct inode_operations ext2_special_inode_operations = { @@ -419,5 +419,5 @@ const struct inode_operations ext2_special_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .check_acl = ext2_check_acl, + .get_acl = ext2_get_acl, }; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 1dd62ed35b85..bd8ac164a3bf 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -327,10 +327,10 @@ static struct inode *ext2_nfs_get_inode(struct super_block *sb, if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) return ERR_PTR(-ESTALE); - /* iget isn't really right if the inode is currently unallocated!! - * ext2_read_inode currently does appropriate checks, but - * it might be "neater" to call ext2_get_inode first and check - * if the inode is valid..... + /* + * ext2_iget isn't quite right if the inode is currently unallocated! + * However ext2_iget currently does appropriate checks to handle stale + * inodes so everything is OK. */ inode = ext2_iget(sb, ino); if (IS_ERR(inode)) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 529970617a21..d27b71f1d183 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -161,6 +161,10 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, if (name == NULL) return -EINVAL; + name_len = strlen(name); + if (name_len > 255) + return -ERANGE; + down_read(&EXT2_I(inode)->xattr_sem); error = -ENODATA; if (!EXT2_I(inode)->i_file_acl) @@ -181,12 +185,8 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", error = -EIO; goto cleanup; } - /* find named attribute */ - name_len = strlen(name); - error = -ERANGE; - if (name_len > 255) - goto cleanup; + /* find named attribute */ entry = FIRST_ENTRY(bh); while (!IS_LAST_ENTRY(entry)) { struct ext2_xattr_entry *next = diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 5d979b4347b0..c922adc8ef41 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -46,28 +46,30 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, value, size, flags); } -int -ext2_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext2_initxattrs, NULL); +} + const struct xattr_handler ext2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext2_xattr_security_list, diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index edfeb293d4cb..3091f62e55b6 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -131,7 +131,7 @@ fail: * * inode->i_mutex: don't care */ -static struct posix_acl * +struct posix_acl * ext3_get_acl(struct inode *inode, int type) { int name_index; @@ -199,12 +199,10 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; inode->i_ctime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); if (error == 0) @@ -239,29 +237,6 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -int -ext3_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - return -ECHILD; - return -EAGAIN; - } - - acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; -} - /* * Initialize the ACLs of a new inode. Called from ext3_new_inode. * @@ -284,31 +259,20 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - struct posix_acl *clone; - mode_t mode; - if (S_ISDIR(inode->i_mode)) { error = ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT, acl); if (error) goto cleanup; } - clone = posix_acl_clone(acl, GFP_NOFS); - error = -ENOMEM; - if (!clone) - goto cleanup; - - mode = inode->i_mode; - error = posix_acl_create_masq(clone, &mode); - if (error >= 0) { - inode->i_mode = mode; - if (error > 0) { - /* This is an extended ACL */ - error = ext3_set_acl(handle, inode, - ACL_TYPE_ACCESS, clone); - } + error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + if (error < 0) + return error; + + if (error > 0) { + /* This is an extended ACL */ + error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); } - posix_acl_release(clone); } cleanup: posix_acl_release(acl); @@ -332,7 +296,9 @@ cleanup: int ext3_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; + handle_t *handle; + int retries = 0; int error; if (S_ISLNK(inode->i_mode)) @@ -342,31 +308,24 @@ ext3_acl_chmod(struct inode *inode) acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) { - handle_t *handle; - int retries = 0; - - retry: - handle = ext3_journal_start(inode, - EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext3_std_error(inode->i_sb, error); - goto out; - } - error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); - ext3_journal_stop(handle); - if (error == -ENOSPC && - ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; + error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (error) + return error; +retry: + handle = ext3_journal_start(inode, + EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + ext3_std_error(inode->i_sb, error); + goto out; } + error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); + ext3_journal_stop(handle); + if (error == -ENOSPC && + ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; out: - posix_acl_release(clone); + posix_acl_release(acl); return error; } diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 597334626de9..dbc921e458c5 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size) #ifdef CONFIG_EXT3_FS_POSIX_ACL /* acl.c */ -extern int ext3_check_acl (struct inode *, int); +extern struct posix_acl *ext3_get_acl(struct inode *inode, int type); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext3_check_acl NULL +#define ext3_get_acl NULL static inline int ext3_acl_chmod(struct inode *inode) diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index fe52297e31ad..a2038928f9a3 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -21,6 +21,7 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> +#include <trace/events/ext3.h> /* * balloc.c contains the blocks allocation and deallocation routines @@ -161,6 +162,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) desc = ext3_get_group_desc(sb, block_group, NULL); if (!desc) return NULL; + trace_ext3_read_block_bitmap(sb, block_group); bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { @@ -351,6 +353,7 @@ void ext3_rsv_window_add(struct super_block *sb, struct rb_node * parent = NULL; struct ext3_reserve_window_node *this; + trace_ext3_rsv_window_add(sb, rsv); while (*p) { parent = *p; @@ -424,7 +427,7 @@ static inline int rsv_is_empty(struct ext3_reserve_window *rsv) void ext3_init_block_alloc_info(struct inode *inode) { struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext3_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); @@ -476,8 +479,10 @@ void ext3_discard_reservation(struct inode *inode) rsv = &block_i->rsv_window_node; if (!rsv_is_empty(&rsv->rsv_window)) { spin_lock(rsv_lock); - if (!rsv_is_empty(&rsv->rsv_window)) + if (!rsv_is_empty(&rsv->rsv_window)) { + trace_ext3_discard_reservation(inode, rsv); rsv_window_remove(inode->i_sb, rsv); + } spin_unlock(rsv_lock); } } @@ -683,14 +688,10 @@ error_return: void ext3_free_blocks(handle_t *handle, struct inode *inode, ext3_fsblk_t block, unsigned long count) { - struct super_block * sb; + struct super_block *sb = inode->i_sb; unsigned long dquot_freed_blocks; - sb = inode->i_sb; - if (!sb) { - printk ("ext3_free_blocks: nonexistent device"); - return; - } + trace_ext3_free_blocks(inode, block, count); ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); if (dquot_freed_blocks) dquot_free_block(inode, dquot_freed_blocks); @@ -1136,6 +1137,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, else start_block = grp_goal + group_first_block; + trace_ext3_alloc_new_reservation(sb, start_block); size = my_rsv->rsv_goal_size; if (!rsv_is_empty(&my_rsv->rsv_window)) { @@ -1230,8 +1232,11 @@ retry: * check if the first free block is within the * free space we just reserved */ - if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) + if (start_block >= my_rsv->rsv_start && + start_block <= my_rsv->rsv_end) { + trace_ext3_reserved(sb, start_block, my_rsv); return 0; /* success */ + } /* * if the first free bit we found is out of the reservable space * continue search for next reservable space, @@ -1435,14 +1440,14 @@ out: * * Check if filesystem has at least 1 free block available for allocation. */ -static int ext3_has_free_blocks(struct ext3_sb_info *sbi) +static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation) { ext3_fsblk_t free_blocks, root_blocks; free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - sbi->s_resuid != current_fsuid() && + !use_reservation && sbi->s_resuid != current_fsuid() && (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { return 0; } @@ -1463,7 +1468,7 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi) */ int ext3_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3) + if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); @@ -1514,10 +1519,6 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, *errp = -ENOSPC; sb = inode->i_sb; - if (!sb) { - printk("ext3_new_block: nonexistent device"); - return 0; - } /* * Check quota for allocation of this block. @@ -1528,8 +1529,10 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, return 0; } + trace_ext3_request_blocks(inode, goal, num); + sbi = EXT3_SB(sb); - es = EXT3_SB(sb)->s_es; + es = sbi->s_es; ext3_debug("goal=%lu.\n", goal); /* * Allocate a block from reservation only when @@ -1543,7 +1546,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) my_rsv = &block_i->rsv_window_node; - if (!ext3_has_free_blocks(sbi)) { + if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) { *errp = -ENOSPC; goto out; } @@ -1742,6 +1745,10 @@ allocated: brelse(bitmap_bh); dquot_free_block(inode, *count-num); *count = num; + + trace_ext3_allocate_blocks(inode, goal, num, + (unsigned long long)ret_block); + return ret_block; io_error: @@ -1917,9 +1924,10 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) * reaches any used block. Then issue a TRIM command on this extent and free * the extent in the block bitmap. This is done until whole group is scanned. */ -ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, - ext3_grpblk_t start, ext3_grpblk_t max, - ext3_grpblk_t minblocks) +static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, + unsigned int group, + ext3_grpblk_t start, ext3_grpblk_t max, + ext3_grpblk_t minblocks) { handle_t *handle; ext3_grpblk_t next, free_blocks, bit, freed, count = 0; @@ -1996,6 +2004,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, if ((next - start) < minblocks) goto free_extent; + trace_ext3_discard_blocks(sb, discard_block, next - start); /* Send the TRIM command down to the device */ err = sb_issue_discard(sb, discard_block, next - start, GFP_NOFS, 0); @@ -2100,7 +2109,7 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) return -EINVAL; if (start >= max_blks) - goto out; + return -EINVAL; if (start + len > max_blks) len = max_blks - start; @@ -2148,8 +2157,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) if (ret >= 0) ret = 0; - -out: range->len = trimmed * sb->s_blocksize; return ret; diff --git a/fs/ext3/file.c b/fs/ext3/file.c index f55df0e61cbd..724df69847dc 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -71,7 +71,6 @@ const struct file_operations ext3_file_operations = { }; const struct inode_operations ext3_file_inode_operations = { - .truncate = ext3_truncate, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, @@ -79,7 +78,7 @@ const struct inode_operations ext3_file_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext3_check_acl, + .get_acl = ext3_get_acl, .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 0bcf63adb80a..1860ed356323 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -30,6 +30,7 @@ #include <linux/jbd.h> #include <linux/ext3_fs.h> #include <linux/ext3_jbd.h> +#include <trace/events/ext3.h> /* * akpm: A new design for ext3_sync_file(). @@ -51,19 +52,14 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret, needs_barrier = 0; tid_t commit_tid; + trace_ext3_sync_file_enter(file, datasync); + if (inode->i_sb->s_flags & MS_RDONLY) return 0; ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) - return ret; - - /* - * Taking the mutex here just to keep consistent with how fsync was - * called previously, however it looks like we don't need to take - * i_mutex at all. - */ - mutex_lock(&inode->i_mutex); + goto out; J_ASSERT(ext3_journal_current_handle() == NULL); @@ -82,8 +78,8 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * safe in-journal, which is all fsync() needs to ensure. */ if (ext3_should_journal_data(inode)) { - mutex_unlock(&inode->i_mutex); - return ext3_force_commit(inode->i_sb); + ret = ext3_force_commit(inode->i_sb); + goto out; } if (datasync) @@ -104,6 +100,7 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - mutex_unlock(&inode->i_mutex); +out: + trace_ext3_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bfc2dc43681d..5c866e06e7ab 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -23,6 +23,7 @@ #include <linux/buffer_head.h> #include <linux/random.h> #include <linux/bitops.h> +#include <trace/events/ext3.h> #include <asm/byteorder.h> @@ -118,6 +119,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) ino = inode->i_ino; ext3_debug ("freeing inode %lu\n", ino); + trace_ext3_free_inode(inode); is_directory = S_ISDIR(inode->i_mode); @@ -176,42 +178,6 @@ error_return: } /* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block - * group to find a free inode. - */ -static int find_group_dir(struct super_block *sb, struct inode *parent) -{ - int ngroups = EXT3_SB(sb)->s_groups_count; - unsigned int freei, avefreei; - struct ext3_group_desc *desc, *best_desc = NULL; - int group, best_group = -1; - - freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; - - for (group = 0; group < ngroups; group++) { - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) - continue; - if (!best_desc || - (le16_to_cpu(desc->bg_free_blocks_count) > - le16_to_cpu(best_desc->bg_free_blocks_count))) { - best_group = group; - best_desc = desc; - } - } - return best_group; -} - -/* * Orlov's allocator for directories. * * We always try to spread first-level directories. @@ -426,6 +392,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, return ERR_PTR(-EPERM); sb = dir->i_sb; + trace_ext3_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -433,12 +400,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, sbi = EXT3_SB(sb); es = sbi->s_es; - if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) - group = find_group_dir(sb, dir); - else - group = find_group_orlov(sb, dir); - } else + if (S_ISDIR(mode)) + group = find_group_orlov(sb, dir); + else group = find_group_other(sb, dir); err = -ENOSPC; @@ -601,6 +565,7 @@ got: } ext3_debug("allocating inode %lu\n", inode->i_ino); + trace_ext3_allocate_inode(inode, dir, mode); goto really_out; fail: ext3_std_error(sb, err); @@ -617,7 +582,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); brelse(bitmap_bh); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2978a2a17a59..85fe655fe3e0 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -38,10 +38,12 @@ #include <linux/bio.h> #include <linux/fiemap.h> #include <linux/namei.h> +#include <trace/events/ext3.h> #include "xattr.h" #include "acl.h" static int ext3_writepage_trans_blocks(struct inode *inode); +static int ext3_block_truncate_page(struct inode *inode, loff_t from); /* * Test whether an inode is a fast symlink. @@ -70,6 +72,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, might_sleep(); + trace_ext3_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " @@ -194,20 +197,47 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode) */ void ext3_evict_inode (struct inode *inode) { + struct ext3_inode_info *ei = EXT3_I(inode); struct ext3_block_alloc_info *rsv; handle_t *handle; int want_delete = 0; + trace_ext3_evict_inode(inode); if (!inode->i_nlink && !is_bad_inode(inode)) { dquot_initialize(inode); want_delete = 1; } + /* + * When journalling data dirty buffers are tracked only in the journal. + * So although mm thinks everything is clean and ready for reaping the + * inode might still have some pages to write in the running + * transaction or waiting to be checkpointed. Thus calling + * journal_invalidatepage() (via truncate_inode_pages()) to discard + * these buffers can cause data loss. Also even if we did not discard + * these buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to read + * them before the transaction is checkpointed. So be careful and + * force everything to disk here... We use ei->i_datasync_tid to + * store the newest transaction containing inode's data. + * + * Note that directories do not have this problem because they don't + * use page cache. + */ + if (inode->i_nlink && ext3_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + tid_t commit_tid = atomic_read(&ei->i_datasync_tid); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + + log_start_commit(journal, commit_tid); + log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } truncate_inode_pages(&inode->i_data, 0); ext3_discard_reservation(inode); - rsv = EXT3_I(inode)->i_block_alloc_info; - EXT3_I(inode)->i_block_alloc_info = NULL; + rsv = ei->i_block_alloc_info; + ei->i_block_alloc_info = NULL; if (unlikely(rsv)) kfree(rsv); @@ -231,15 +261,13 @@ void ext3_evict_inode (struct inode *inode) if (inode->i_blocks) ext3_truncate(inode); /* - * Kill off the orphan record which ext3_truncate created. - * AKPM: I think this can be inside the above `if'. - * Note that ext3_orphan_del() has to be able to cope with the - * deletion of a non-existent orphan - this is because we don't - * know if ext3_truncate() actually created an orphan record. - * (Well, we could do this if we need to, but heck - it works) + * Kill off the orphan record created when the inode lost the last + * link. Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - ext3_truncate() could + * have removed the record. */ ext3_orphan_del(handle, inode); - EXT3_I(inode)->i_dtime = get_seconds(); + ei->i_dtime = get_seconds(); /* * One subtle ordering requirement: if anything has gone wrong @@ -842,6 +870,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, ext3_fsblk_t first_block = 0; + trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create); J_ASSERT(handle != NULL || create == 0); depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); @@ -886,6 +915,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!create || err == -EIO) goto cleanup; + /* + * Block out ext3_truncate while we alter the tree + */ mutex_lock(&ei->truncate_mutex); /* @@ -934,9 +966,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, */ count = ext3_blks_to_allocate(partial, indirect_blks, maxblocks, blocks_to_boundary); - /* - * Block out ext3_truncate while we alter the tree - */ err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, offsets + (partial - chain), partial); @@ -970,6 +999,9 @@ cleanup: } BUFFER_TRACE(bh_result, "returned"); out: + trace_ext3_get_blocks_exit(inode, iblock, + depth ? le32_to_cpu(chain[depth-1].key) : 0, + count, err); return err; } @@ -1102,7 +1134,7 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, return bh; if (buffer_uptodate(bh)) return bh; - ll_rw_block(READ_META, 1, &bh); + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1202,6 +1234,16 @@ static void ext3_truncate_failed_write(struct inode *inode) ext3_truncate(inode); } +/* + * Truncate blocks that were not used by direct IO write. We have to zero out + * the last file block as well because direct IO might have written to it. + */ +static void ext3_truncate_failed_direct_write(struct inode *inode) +{ + ext3_block_truncate_page(inode, inode->i_size); + ext3_truncate(inode); +} + static int ext3_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1217,6 +1259,8 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping, * we allocate blocks but write fails for some reason */ int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; + trace_ext3_write_begin(inode, pos, len, flags); + index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1332,6 +1376,7 @@ static int ext3_ordered_write_end(struct file *file, unsigned from, to; int ret = 0, ret2; + trace_ext3_ordered_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); from = pos & (PAGE_CACHE_SIZE - 1); @@ -1367,6 +1412,7 @@ static int ext3_writeback_write_end(struct file *file, struct inode *inode = file->f_mapping->host; int ret; + trace_ext3_writeback_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); update_file_sizes(inode, pos, copied); /* @@ -1391,10 +1437,12 @@ static int ext3_journalled_write_end(struct file *file, { handle_t *handle = ext3_journal_current_handle(); struct inode *inode = mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); int ret = 0, ret2; int partial = 0; unsigned from, to; + trace_ext3_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1419,8 +1467,9 @@ static int ext3_journalled_write_end(struct file *file, if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ext3_set_inode_state(inode, EXT3_STATE_JDATA); - if (inode->i_size > EXT3_I(inode)->i_disksize) { - EXT3_I(inode)->i_disksize = inode->i_size; + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); + if (inode->i_size > ei->i_disksize) { + ei->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; @@ -1577,6 +1626,7 @@ static int ext3_ordered_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_ordered_writepage(page); if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1647,6 +1697,7 @@ static int ext3_writeback_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_writeback_writepage(page); if (page_has_buffers(page)) { if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { @@ -1689,6 +1740,7 @@ static int ext3_journalled_writepage(struct page *page, if (ext3_journal_current_handle()) goto no_write; + trace_ext3_journalled_writepage(page); handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -1715,6 +1767,8 @@ static int ext3_journalled_writepage(struct page *page, if (ret == 0) ret = err; ext3_set_inode_state(inode, EXT3_STATE_JDATA); + atomic_set(&EXT3_I(inode)->i_datasync_tid, + handle->h_transaction->t_tid); unlock_page(page); } else { /* @@ -1739,6 +1793,7 @@ out_unlock: static int ext3_readpage(struct file *file, struct page *page) { + trace_ext3_readpage(page); return mpage_readpage(page, ext3_get_block); } @@ -1753,6 +1808,8 @@ static void ext3_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_invalidatepage(page, offset); + /* * If it's a full truncate we just forget about the pending dirtying */ @@ -1766,6 +1823,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_releasepage(page); WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; @@ -1794,6 +1852,8 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_length(iov, nr_segs); int retries = 0; + trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + if (rw == WRITE) { loff_t final_size = offset + count; @@ -1827,7 +1887,7 @@ retry: loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - vmtruncate(inode, isize); + ext3_truncate_failed_direct_write(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1841,7 +1901,7 @@ retry: /* This is really bad luck. We've written the data * but cannot extend i_size. Truncate allocated blocks * and pretend the write failed... */ - ext3_truncate(inode); + ext3_truncate_failed_direct_write(inode); ret = PTR_ERR(handle); goto out; } @@ -1867,6 +1927,8 @@ retry: ret = err; } out: + trace_ext3_direct_IO_exit(inode, offset, + iov_length(iov, nr_segs), rw, ret); return ret; } @@ -1949,17 +2011,24 @@ void ext3_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -static int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) +static int ext3_block_truncate_page(struct inode *inode, loff_t from) { ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned offset = from & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, length, pos; - struct inode *inode = mapping->host; + struct page *page; + handle_t *handle = NULL; struct buffer_head *bh; int err = 0; + /* Truncated on block boundary - nothing to do */ blocksize = inode->i_sb->s_blocksize; + if ((from & (blocksize - 1)) == 0) + return 0; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return -ENOMEM; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -2004,11 +2073,23 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, goto unlock; } + /* data=writeback mode doesn't need transaction to zero-out data */ + if (!ext3_should_writeback_data(inode)) { + /* We journal at most one block */ + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + clear_highpage(page); + flush_dcache_page(page); + err = PTR_ERR(handle); + goto unlock; + } + } + if (ext3_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext3_journal_get_write_access(handle, bh); if (err) - goto unlock; + goto stop; } zero_user(page, offset, length); @@ -2022,6 +2103,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, err = ext3_journal_dirty_data(handle, bh); mark_buffer_dirty(bh); } +stop: + if (handle) + ext3_journal_stop(handle); unlock: unlock_page(page); @@ -2390,8 +2474,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, int ext3_can_truncate(struct inode *inode) { - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return 0; if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) @@ -2435,7 +2517,6 @@ void ext3_truncate(struct inode *inode) struct ext3_inode_info *ei = EXT3_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; int offsets[4]; Indirect chain[4]; Indirect *partial; @@ -2443,7 +2524,8 @@ void ext3_truncate(struct inode *inode) int n; long last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; + + trace_ext3_truncate_enter(inode); if (!ext3_can_truncate(inode)) goto out_notrans; @@ -2451,37 +2533,12 @@ void ext3_truncate(struct inode *inode) if (inode->i_size == 0 && ext3_should_writeback_data(inode)) ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); - /* - * We have to lock the EOF page here, because lock_page() nests - * outside journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - goto out_notrans; - } - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) goto out_notrans; - } last_block = (inode->i_size + blocksize-1) >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); - - if (page) - ext3_block_truncate_page(handle, page, mapping, inode->i_size); - n = ext3_block_to_path(inode, last_block, offsets, NULL); if (n == 0) goto out_stop; /* error */ @@ -2596,6 +2653,7 @@ out_stop: ext3_orphan_del(handle, inode); ext3_journal_stop(handle); + trace_ext3_truncate_exit(inode); return; out_notrans: /* @@ -2604,6 +2662,7 @@ out_notrans: */ if (inode->i_nlink) ext3_orphan_del(NULL, inode); + trace_ext3_truncate_exit(inode); } static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, @@ -2745,9 +2804,10 @@ make_io: * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ + trace_ext3_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ_META, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext3_error(inode->i_sb, "ext3_get_inode_loc", @@ -2839,7 +2899,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); @@ -3229,18 +3289,36 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) } error = ext3_orphan_add(handle, inode); + if (error) { + ext3_journal_stop(handle); + goto err_out; + } EXT3_I(inode)->i_disksize = attr->ia_size; - rc = ext3_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + error = ext3_mark_inode_dirty(handle, inode); ext3_journal_stop(handle); + if (error) { + /* Some hard fs error must have happened. Bail out. */ + ext3_orphan_del(NULL, inode); + goto err_out; + } + rc = ext3_block_truncate_page(inode, attr->ia_size); + if (rc) { + /* Cleanup orphan list and exit */ + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext3_orphan_del(NULL, inode); + goto err_out; + } + ext3_orphan_del(handle, inode); + ext3_journal_stop(handle); + goto err_out; + } } if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - rc = vmtruncate(inode, attr->ia_size); - if (rc) - goto err_out; + truncate_setsize(inode, attr->ia_size); + ext3_truncate(inode); } setattr_copy(inode, attr); @@ -3374,6 +3452,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) int err; might_sleep(); + trace_ext3_mark_inode_dirty(inode, _RET_IP_); err = ext3_reserve_inode_write(handle, inode, &iloc); if (!err) err = ext3_mark_iloc_dirty(handle, inode, &iloc); diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index f4090bd2f345..ba1b54e23cae 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -150,30 +150,6 @@ setversion_out: mnt_drop_write(filp->f_path.mnt); return err; } -#ifdef CONFIG_JBD_DEBUG - case EXT3_IOC_WAIT_FOR_READONLY: - /* - * This is racy - by the time we're woken up and running, - * the superblock could be released. And the module could - * have been unloaded. So sue me. - * - * Returns 1 if it slept, else zero. - */ - { - struct super_block *sb = inode->i_sb; - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) { - schedule(); - ret = 1; - } - remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - return ret; - } -#endif case EXT3_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode) @@ -285,7 +261,7 @@ group_add_out: if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&range, (struct fstrim_range *)arg, + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; @@ -293,7 +269,7 @@ group_add_out: if (ret < 0) return ret; - if (copy_to_user((struct fstrim_range *)arg, &range, + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index c095cf5640c7..642dc6d66dfd 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -36,6 +36,7 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bio.h> +#include <trace/events/ext3.h> #include "namei.h" #include "xattr.h" @@ -287,7 +288,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent while (len--) printk("%c", *name++); ext3fs_dirhash(de->name, de->name_len, &h); printk(":%x.%u ", h.hash, - ((char *) de - base)); + (unsigned) ((char *) de - base)); } space += EXT3_DIR_REC_LEN(de->name_len); names++; @@ -921,7 +922,8 @@ restart: bh = ext3_getblk(NULL, dir, b++, 0, &err); bh_use[ra_max] = bh; if (bh) - ll_rw_block(READ_META, 1, &bh); + ll_rw_block(READ | REQ_META | REQ_PRIO, + 1, &bh); } } if ((bh = bh_use[ra_ptr++]) == NULL) @@ -1013,7 +1015,7 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir, *err = -ENOENT; errout: - dxtrace(printk("%s not found\n", name)); + dxtrace(printk("%s not found\n", entry->name)); dx_release (frames); return NULL; } @@ -1819,7 +1821,7 @@ retry: de->name_len = 2; strcpy (de->name, ".."); ext3_set_de_type(dir->i_sb, de, S_IFDIR); - inode->i_nlink = 2; + set_nlink(inode, 2); BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, dir_block); if (err) @@ -1831,7 +1833,7 @@ retry: if (err) { out_clear_inode: - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); @@ -2140,6 +2142,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) struct ext3_dir_entry_2 * de; handle_t *handle; + trace_ext3_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); @@ -2167,7 +2170,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) ext3_warning (inode->i_sb, "ext3_unlink", "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = ext3_delete_entry(handle, dir, de, bh); if (retval) @@ -2185,6 +2188,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) end_unlink: ext3_journal_stop(handle); brelse (bh); + trace_ext3_unlink_exit(dentry, retval); return retval; } @@ -2206,9 +2210,11 @@ static int ext3_symlink (struct inode * dir, /* * For non-fast symlinks, we just allocate inode and put it on * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks. + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. */ - credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT3_XATTR_TRANS_BLOCKS; } else { /* * Fast symlink. We have to add entry to directory @@ -2529,7 +2535,7 @@ const struct inode_operations ext3_dir_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext3_check_acl, + .get_acl = ext3_get_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2540,5 +2546,5 @@ const struct inode_operations ext3_special_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext3_check_acl, + .get_acl = ext3_get_acl, }; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b57ea2f91269..922d289aeeb3 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -44,6 +44,9 @@ #include "acl.h" #include "namei.h" +#define CREATE_TRACE_POINTS +#include <trace/events/ext3.h> + #ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA #else @@ -497,6 +500,14 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } +static int ext3_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext3_drop_inode(inode, drop); + return drop; +} + static void ext3_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); @@ -641,8 +652,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouid32"); if (test_opt(sb, DEBUG)) seq_puts(seq, ",debug"); - if (test_opt(sb, OLDALLOC)) - seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT3_FS_XATTR if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -788,6 +797,7 @@ static const struct super_operations ext3_sops = { .destroy_inode = ext3_destroy_inode, .write_inode = ext3_write_inode, .dirty_inode = ext3_dirty_inode, + .drop_inode = ext3_drop_inode, .evict_inode = ext3_evict_inode, .put_super = ext3_put_super, .sync_fs = ext3_sync_fs, @@ -1037,10 +1047,12 @@ static int parse_options (char *options, struct super_block *sb, set_opt (sbi->s_mount_opt, DEBUG); break; case Opt_oldalloc: - set_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated oldalloc option"); break; case Opt_orlov: - clear_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated orlov option"); break; #ifdef CONFIG_EXT3_FS_XATTR case Opt_user_xattr: @@ -2509,6 +2521,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait) { tid_t target; + trace_ext3_sync_fs(sb, wait); if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { if (wait) log_wait_commit(EXT3_SB(sb)->s_journal, target); @@ -2656,13 +2669,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, - * require a full umount/remount for now. + * require a full umount & mount for now. */ if (es->s_last_orphan) { ext3_msg(sb, KERN_WARNING, "warning: couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " - "umount/remount instead."); + "umount & mount instead."); err = -EINVAL; goto restore_opts; } diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 32e6cc23bd9a..d565759d82ee 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -803,8 +803,16 @@ inserted: /* We need to allocate a new block */ ext3_fsblk_t goal = ext3_group_first_block_no(sb, EXT3_I(inode)->i_block_group); - ext3_fsblk_t block = ext3_new_block(handle, inode, - goal, &error); + ext3_fsblk_t block; + + /* + * Protect us agaist concurrent allocations to the + * same inode from ext3_..._writepage(). Reservation + * code does not expect racing allocations. + */ + mutex_lock(&EXT3_I(inode)->truncate_mutex); + block = ext3_new_block(handle, inode, goal, &error); + mutex_unlock(&EXT3_I(inode)->truncate_mutex); if (error) goto cleanup; ea_idebug(inode, "creating block %d", block); diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index b8d9f83aa5c5..3c218b8a51d4 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -48,28 +48,32 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int -ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext3_xattr_set_handle(handle, inode, + EXT3_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext3_initxattrs, handle); +} + const struct xattr_handler ext3_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext3_xattr_security_list, diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 04109460ba9e..56fd8f865930 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o + mmp.o indirect.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 60d900fcc3db..a5c29bb3b835 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -131,7 +131,7 @@ fail: * * inode->i_mutex: don't care */ -static struct posix_acl * +struct posix_acl * ext4_get_acl(struct inode *inode, int type) { int name_index; @@ -198,12 +198,10 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); if (error == 0) @@ -237,29 +235,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -int -ext4_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - return -ECHILD; - return -EAGAIN; - } - - acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; -} - /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * @@ -282,31 +257,20 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - struct posix_acl *clone; - mode_t mode; - if (S_ISDIR(inode->i_mode)) { error = ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, acl); if (error) goto cleanup; } - clone = posix_acl_clone(acl, GFP_NOFS); - error = -ENOMEM; - if (!clone) - goto cleanup; - - mode = inode->i_mode; - error = posix_acl_create_masq(clone, &mode); - if (error >= 0) { - inode->i_mode = mode; - if (error > 0) { - /* This is an extended ACL */ - error = ext4_set_acl(handle, inode, - ACL_TYPE_ACCESS, clone); - } + error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + if (error < 0) + return error; + + if (error > 0) { + /* This is an extended ACL */ + error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); } - posix_acl_release(clone); } cleanup: posix_acl_release(acl); @@ -330,9 +294,12 @@ cleanup: int ext4_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; + handle_t *handle; + int retries = 0; int error; + if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; if (!test_opt(inode->i_sb, POSIX_ACL)) @@ -340,31 +307,24 @@ ext4_acl_chmod(struct inode *inode) acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) { - handle_t *handle; - int retries = 0; - - retry: - handle = ext4_journal_start(inode, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext4_std_error(inode->i_sb, error); - goto out; - } - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); - ext4_journal_stop(handle); - if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; + error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (error) + return error; +retry: + handle = ext4_journal_start(inode, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + ext4_std_error(inode->i_sb, error); + goto out; } + error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); + ext4_journal_stop(handle); + if (error == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; out: - posix_acl_release(clone); + posix_acl_release(acl); return error; } diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 9d843d5deac4..18cb39ed7c7b 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_check_acl(struct inode *, int); +struct posix_acl *ext4_get_acl(struct inode *inode, int type); extern int ext4_acl_chmod(struct inode *); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext4_check_acl NULL +#define ext4_get_acl NULL static inline int ext4_acl_chmod(struct inode *inode) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 264f6949511e..12ccacda44e0 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -28,7 +28,8 @@ */ /* - * Calculate the block group number and offset, given a block number + * Calculate the block group number and offset into the block/cluster + * allocation bitmap, given a block number */ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) @@ -37,7 +38,8 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_grpblk_t offset; blocknr = blocknr - le32_to_cpu(es->s_first_data_block); - offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)); + offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> + EXT4_SB(sb)->s_cluster_bits; if (offsetp) *offsetp = offset; if (blockgrpp) @@ -55,130 +57,169 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, return 0; } -static int ext4_group_used_meta_blocks(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +/* Return the number of clusters used for file system metadata; this + * represents the overhead needed by the file system. + */ +unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { - ext4_fsblk_t tmp; + unsigned num_clusters; + int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; + ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); + ext4_fsblk_t itbl_blk; struct ext4_sb_info *sbi = EXT4_SB(sb); - /* block bitmap, inode bitmap, and inode table blocks */ - int used_blocks = sbi->s_itb_per_group + 2; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { - if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), - block_group)) - used_blocks--; - - if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), - block_group)) - used_blocks--; - - tmp = ext4_inode_table(sb, gdp); - for (; tmp < ext4_inode_table(sb, gdp) + - sbi->s_itb_per_group; tmp++) { - if (!ext4_block_in_group(sb, tmp, block_group)) - used_blocks -= 1; + /* This is the number of clusters used by the superblock, + * block group descriptors, and reserved block group + * descriptor blocks */ + num_clusters = ext4_num_base_meta_clusters(sb, block_group); + + /* + * For the allocation bitmaps and inode table, we first need + * to check to see if the block is in the block group. If it + * is, then check to see if the cluster is already accounted + * for in the clusters used for the base metadata cluster, or + * if we can increment the base metadata cluster to include + * that block. Otherwise, we will have to track the cluster + * used for the allocation bitmap or inode table explicitly. + * Normally all of these blocks are contiguous, so the special + * case handling shouldn't be necessary except for *very* + * unusual file system layouts. + */ + if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { + block_cluster = EXT4_B2C(sbi, (start - + ext4_block_bitmap(sb, gdp))); + if (block_cluster < num_clusters) + block_cluster = -1; + else if (block_cluster == num_clusters) { + num_clusters++; + block_cluster = -1; } } - return used_blocks; -} -/* Initializes an uninitialized block bitmap if given, and returns the - * number of blocks free in the group. */ -unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, - ext4_group_t block_group, struct ext4_group_desc *gdp) -{ - int bit, bit_max; - ext4_group_t ngroups = ext4_get_groups_count(sb); - unsigned free_blocks, group_blocks; - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (bh) { - J_ASSERT_BH(bh, buffer_locked(bh)); - - /* If checksum is bad mark all blocks used to prevent allocation - * essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", - block_group); - ext4_free_blks_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - return 0; + if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { + inode_cluster = EXT4_B2C(sbi, + start - ext4_inode_bitmap(sb, gdp)); + if (inode_cluster < num_clusters) + inode_cluster = -1; + else if (inode_cluster == num_clusters) { + num_clusters++; + inode_cluster = -1; } - memset(bh->b_data, 0, sb->s_blocksize); } - /* Check for superblock and gdt backups in this group */ - bit_max = ext4_bg_has_super(sb, block_group); - - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || - block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * - sbi->s_desc_per_block) { - if (bit_max) { - bit_max += ext4_bg_num_gdb(sb, block_group); - bit_max += - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + itbl_blk = ext4_inode_table(sb, gdp); + for (i = 0; i < sbi->s_itb_per_group; i++) { + if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { + c = EXT4_B2C(sbi, start - itbl_blk + i); + if ((c < num_clusters) || (c == inode_cluster) || + (c == block_cluster) || (c == itbl_cluster)) + continue; + if (c == num_clusters) { + num_clusters++; + continue; + } + num_clusters++; + itbl_cluster = c; } - } else { /* For META_BG_BLOCK_GROUPS */ - bit_max += ext4_bg_num_gdb(sb, block_group); } - if (block_group == ngroups - 1) { + if (block_cluster != -1) + num_clusters++; + if (inode_cluster != -1) + num_clusters++; + + return num_clusters; +} + +static unsigned int num_clusters_in_group(struct super_block *sb, + ext4_group_t block_group) +{ + unsigned int blocks; + + if (block_group == ext4_get_groups_count(sb) - 1) { /* - * Even though mke2fs always initialize first and last group - * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need - * to make sure we calculate the right free blocks + * Even though mke2fs always initializes the first and + * last group, just in case some other tool was used, + * we need to make sure we calculate the right free + * blocks. */ - group_blocks = ext4_blocks_count(sbi->s_es) - - ext4_group_first_block_no(sb, ngroups - 1); - } else { - group_blocks = EXT4_BLOCKS_PER_GROUP(sb); - } + blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, block_group); + } else + blocks = EXT4_BLOCKS_PER_GROUP(sb); + return EXT4_NUM_B2C(EXT4_SB(sb), blocks); +} - free_blocks = group_blocks - bit_max; +/* Initializes an uninitialized block bitmap */ +void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + unsigned int bit, bit_max; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start, tmp; + int flex_bg = 0; + + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks used to prevent allocation + * essentially implementing a per-group read-only flag. */ + if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", block_group); + ext4_free_group_clusters_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); + memset(bh->b_data, 0xff, sb->s_blocksize); + return; + } + memset(bh->b_data, 0, sb->s_blocksize); - if (bh) { - ext4_fsblk_t start, tmp; - int flex_bg = 0; + bit_max = ext4_num_base_meta_clusters(sb, block_group); + for (bit = 0; bit < bit_max; bit++) + ext4_set_bit(bit, bh->b_data); - for (bit = 0; bit < bit_max; bit++) - ext4_set_bit(bit, bh->b_data); + start = ext4_group_first_block_no(sb, block_group); - start = ext4_group_first_block_no(sb, block_group); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flex_bg = 1; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_FLEX_BG)) - flex_bg = 1; + /* Set bits for block and inode bitmaps, and inode table */ + tmp = ext4_block_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); - /* Set bits for block and inode bitmaps, and inode table */ - tmp = ext4_block_bitmap(sb, gdp); - if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); + tmp = ext4_inode_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); - tmp = ext4_inode_bitmap(sb, gdp); + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); - - tmp = ext4_inode_table(sb, gdp); - for (; tmp < ext4_inode_table(sb, gdp) + - sbi->s_itb_per_group; tmp++) { - if (!flex_bg || - ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); - } - /* - * Also if the number of blocks within the group is - * less than the blocksize * 8 ( which is the size - * of bitmap ), set rest of the block bitmap to 1 - */ - ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8, - bh->b_data); + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } - return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); + + /* + * Also if the number of blocks within the group is less than + * the blocksize * 8 ( which is the size of bitmap ), set rest + * of the block bitmap to 1 + */ + ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), + sb->s_blocksize * 8, bh->b_data); } +/* Return the number of free blocks in a block group. It is used when + * the block bitmap is uninitialized, so we can't just count the bits + * in the bitmap. */ +unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + return num_clusters_in_group(sb, block_group) - + ext4_num_overhead_clusters(sb, block_group, gdp); +} /* * The free blocks are managed by bitmaps. A file system contains several @@ -362,53 +403,54 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) } /** - * ext4_has_free_blocks() + * ext4_has_free_clusters() * @sbi: in-core super block structure. - * @nblocks: number of needed blocks + * @nclusters: number of needed blocks + * @flags: flags from ext4_mb_new_blocks() * - * Check if filesystem has nblocks free & available for allocation. + * Check if filesystem has nclusters free & available for allocation. * On success return 1, return 0 on failure. */ -static int ext4_has_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags) +static int ext4_has_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) { - s64 free_blocks, dirty_blocks, root_blocks; - struct percpu_counter *fbc = &sbi->s_freeblocks_counter; - struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; - - free_blocks = percpu_counter_read_positive(fbc); - dirty_blocks = percpu_counter_read_positive(dbc); - root_blocks = ext4_r_blocks_count(sbi->s_es); - - if (free_blocks - (nblocks + root_blocks + dirty_blocks) < - EXT4_FREEBLOCKS_WATERMARK) { - free_blocks = percpu_counter_sum_positive(fbc); - dirty_blocks = percpu_counter_sum_positive(dbc); + s64 free_clusters, dirty_clusters, root_clusters; + struct percpu_counter *fcc = &sbi->s_freeclusters_counter; + struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; + + free_clusters = percpu_counter_read_positive(fcc); + dirty_clusters = percpu_counter_read_positive(dcc); + root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es)); + + if (free_clusters - (nclusters + root_clusters + dirty_clusters) < + EXT4_FREECLUSTERS_WATERMARK) { + free_clusters = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc)); + dirty_clusters = percpu_counter_sum_positive(dcc); } - /* Check whether we have space after - * accounting for current dirty blocks & root reserved blocks. + /* Check whether we have space after accounting for current + * dirty clusters & root reserved clusters. */ - if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks)) + if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters)) return 1; - /* Hm, nope. Are (enough) root reserved blocks available? */ + /* Hm, nope. Are (enough) root reserved clusters available? */ if (sbi->s_resuid == current_fsuid() || ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || (flags & EXT4_MB_USE_ROOT_BLOCKS)) { - if (free_blocks >= (nblocks + dirty_blocks)) + if (free_clusters >= (nclusters + dirty_clusters)) return 1; } return 0; } -int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags) +int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) { - if (ext4_has_free_blocks(sbi, nblocks, flags)) { - percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks); + if (ext4_has_free_clusters(sbi, nclusters, flags)) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; } else return -ENOSPC; @@ -428,7 +470,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi, */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) || + if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) || (*retries)++ > 3 || !EXT4_SB(sb)->s_journal) return 0; @@ -444,7 +486,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) - * @count: pointer to total number of blocks needed + * @count: pointer to total number of clusters needed * @errp: error code * * Return 1st allocated block number on success, *count stores total account @@ -476,18 +518,19 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, spin_lock(&EXT4_I(inode)->i_block_reservation_lock); EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - dquot_alloc_block_nofail(inode, ar.len); + dquot_alloc_block_nofail(inode, + EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); } return ret; } /** - * ext4_count_free_blocks() -- count filesystem free blocks + * ext4_count_free_clusters() -- count filesystem free clusters * @sb: superblock * - * Adds up the number of free blocks from each block group. + * Adds up the number of free clusters from each block group. */ -ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) +ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) { ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; @@ -508,7 +551,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += ext4_free_blks_count(sb, gdp); + desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (bitmap_bh == NULL) @@ -516,12 +559,13 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) x = ext4_count_free(bitmap_bh, sb->s_blocksize); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", - i, ext4_free_blks_count(sb, gdp), x); + i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); - printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu" - ", computed = %llu, %llu\n", ext4_free_blocks_count(es), + printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" + ", computed = %llu, %llu\n", + EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else @@ -530,7 +574,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += ext4_free_blks_count(sb, gdp); + desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; @@ -620,3 +664,76 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } +/* + * This function returns the number of file system metadata clusters at + * the beginning of a block group, including the reserved gdt blocks. + */ +unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned num; + + /* Check for superblock and gdt backups in this group */ + num = ext4_bg_has_super(sb, block_group); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * + sbi->s_desc_per_block) { + if (num) { + num += ext4_bg_num_gdb(sb, block_group); + num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + } + } else { /* For META_BG_BLOCK_GROUPS */ + num += ext4_bg_num_gdb(sb, block_group); + } + return EXT4_NUM_B2C(sbi, num); +} +/** + * ext4_inode_to_goal_block - return a hint for block allocation + * @inode: inode for block allocation + * + * Return the ideal location to start allocating blocks for a + * newly created inode. + */ +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_group_t block_group; + ext4_grpblk_t colour; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. + */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour; +} + diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index fac90f3fba80..8efb2f0a3447 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -246,3 +246,24 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, return 1; } +int ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, __le32 *p, unsigned int max) +{ + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + __le32 *bref = p; + unsigned int blk; + + while (bref < p+max) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + blk, 1))) { + es->s_last_error_block = cpu_to_le64(blk); + ext4_error_inode(inode, function, line, blk, + "invalid block"); + return -EIO; + } + } + return 0; +} + diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fa44df879711..5b0e26a1272d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -144,9 +144,17 @@ struct ext4_allocation_request { #define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) #define EXT4_MAP_BOUNDARY (1 << BH_Boundary) #define EXT4_MAP_UNINIT (1 << BH_Uninit) +/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of + * ext4_map_blocks wants to know whether or not the underlying cluster has + * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that + * the requested mapping was from previously mapped (or delayed allocated) + * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster + * should never appear on buffer_head's state flags. + */ +#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_UNINIT) + EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -175,6 +183,7 @@ struct mpage_da_data { */ #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_ERROR 0x0002 +#define EXT4_IO_END_QUEUED 0x0004 struct ext4_io_page { struct page *p_page; @@ -238,8 +247,11 @@ struct ext4_io_submit { # define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) #endif #define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) #else # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) #endif @@ -257,6 +269,14 @@ struct ext4_io_submit { #endif #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) + /* * Structure of a blocks group descriptor */ @@ -288,7 +308,7 @@ struct ext4_group_desc struct flex_groups { atomic_t free_inodes; - atomic_t free_blocks; + atomic_t free_clusters; atomic_t used_dirs; }; @@ -305,6 +325,7 @@ struct flex_groups { #define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) #ifdef __KERNEL__ # define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) # define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) # define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) @@ -357,8 +378,7 @@ struct flex_groups { /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ - EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ - EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) @@ -519,6 +539,8 @@ struct ext4_new_group_data { #define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Request will not result in inode size update (user for fallocate) */ +#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 /* * Flags used by ext4_free_blocks @@ -526,6 +548,14 @@ struct ext4_new_group_data { #define EXT4_FREE_BLOCKS_METADATA 0x0001 #define EXT4_FREE_BLOCKS_FORGET 0x0002 #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 + +/* + * Flags used by ext4_discard_partial_page_buffers + */ +#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001 /* * ioctl commands @@ -536,9 +566,6 @@ struct ext4_new_group_data { #define EXT4_IOC_SETVERSION _IOW('f', 4, long) #define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION #define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION -#ifdef CONFIG_JBD2_DEBUG -#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) -#endif #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) @@ -561,9 +588,6 @@ struct ext4_new_group_data { #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) #define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) -#ifdef CONFIG_JBD2_DEBUG -#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) -#endif #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif @@ -835,6 +859,7 @@ struct ext4_inode_info { ext4_group_t i_last_alloc_group; /* allocation reservation info for delalloc */ + /* In case of bigalloc, these refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; unsigned int i_allocated_meta_blocks; @@ -884,7 +909,6 @@ struct ext4_inode_info { /* * Mount flags */ -#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ @@ -916,6 +940,9 @@ struct ext4_inode_info { #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ + #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ @@ -939,6 +966,8 @@ struct ext4_inode_info { #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le +extern void ext4_set_bits(void *bm, int cur, int len); + /* * Maximal mount counts between two filesystem checks */ @@ -964,9 +993,9 @@ struct ext4_super_block { /*10*/ __le32 s_free_inodes_count; /* Free inodes count */ __le32 s_first_data_block; /* First Data Block */ __le32 s_log_block_size; /* Block size */ - __le32 s_obso_log_frag_size; /* Obsoleted fragment size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ /*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ - __le32 s_obso_frags_per_group; /* Obsoleted fragments per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ __le32 s_inodes_per_group; /* # Inodes per group */ __le32 s_mtime; /* Mount time */ /*30*/ __le32 s_wtime; /* Write time */ @@ -1062,7 +1091,10 @@ struct ext4_super_block { __u8 s_last_error_func[32]; /* function where the error happened */ #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) __u8 s_mount_opts[64]; - __le32 s_reserved[112]; /* Padding to the end of the block */ + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_reserved[109]; /* Padding to the end of the block */ }; #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) @@ -1082,6 +1114,7 @@ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ unsigned long s_inodes_per_block;/* Number of inodes per block */ unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ unsigned long s_inodes_per_group;/* Number of inodes in a group */ unsigned long s_itb_per_group; /* Number of inode table blocks per group */ unsigned long s_gdb_count; /* Number of group descriptor blocks */ @@ -1090,6 +1123,8 @@ struct ext4_sb_info { ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ unsigned long s_overhead_last; /* Last calculated overhead */ unsigned long s_blocks_last; /* Last seen block count */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ @@ -1113,10 +1148,10 @@ struct ext4_sb_info { u32 s_hash_seed[4]; int s_def_hash_version; int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ - struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeclusters_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; - struct percpu_counter s_dirtyblocks_counter; + struct percpu_counter s_dirtyclusters_counter; struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; struct kobject s_kobj; @@ -1126,15 +1161,12 @@ struct ext4_sb_info { struct journal_s *s_journal; struct list_head s_orphan; struct mutex s_orphan_lock; - struct mutex s_resize_lock; + unsigned long s_resize_flags; /* Flags indicating if there + is a resizer */ unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; struct block_device *journal_bdev; -#ifdef CONFIG_JBD2_DEBUG - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ -#endif #ifdef CONFIG_QUOTA char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ @@ -1214,6 +1246,9 @@ struct ext4_sb_info { /* Kernel thread for multiple mount protection */ struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + atomic_t s_last_trim_minblks; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1240,6 +1275,15 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } +} + /* * Inode dynamic state flags */ @@ -1352,6 +1396,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1394,7 +1439,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ - EXT4_FEATURE_RO_COMPAT_HUGE_FILE) + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC) /* * Default values for user and/or group using reserved blocks @@ -1727,9 +1773,9 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, unsigned int flags, unsigned long *count, int *errp); -extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags); -extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, @@ -1737,12 +1783,19 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); -extern unsigned ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -#define ext4_free_blocks_after_init(sb, group, desc) \ - ext4_init_block_bitmap(sb, NULL, group, desc) +extern void ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +extern unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, @@ -1767,7 +1820,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct /* ialloc.c */ extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, - const struct qstr *qstr, __u32 goal); + const struct qstr *qstr, __u32 goal, + uid_t *owner); extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); @@ -1793,7 +1847,7 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode, unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); -extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); @@ -1830,10 +1884,27 @@ extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length); +extern int ext4_discard_partial_page_buffers(handle_t *handle, + struct address_space *mapping, loff_t from, + loff_t length, int flags); +extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); +extern void ext4_ind_truncate(struct inode *inode); + /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); @@ -1855,40 +1926,43 @@ extern int ext4_group_extend(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ -extern void __ext4_error(struct super_block *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern void *ext4_kvmalloc(size_t size, gfp_t flags); +extern void *ext4_kvzalloc(size_t size, gfp_t flags); +extern void ext4_kvfree(void *ptr); +extern __printf(4, 5) +void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...); #define ext4_error(sb, message...) __ext4_error(sb, __func__, \ __LINE__, ## message) -extern void ext4_error_inode(struct inode *, const char *, unsigned int, - ext4_fsblk_t, const char *, ...) - __attribute__ ((format (printf, 5, 6))); -extern void ext4_error_file(struct file *, const char *, unsigned int, - ext4_fsblk_t, const char *, ...) - __attribute__ ((format (printf, 5, 6))); +extern __printf(5, 6) +void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern __printf(5, 6) +void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); -extern void __ext4_abort(struct super_block *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...); #define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ __LINE__, ## message) -extern void __ext4_warning(struct super_block *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); #define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ __LINE__, ## message) -extern void ext4_msg(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, const char *, unsigned int, const char *); #define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ __LINE__, msg) -extern void __ext4_grp_locked_error(const char *, unsigned int, \ - struct super_block *, ext4_group_t, \ - unsigned long, ext4_fsblk_t, \ - const char *, ...) - __attribute__ ((format (printf, 7, 8))); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); #define ext4_grp_locked_error(sb, grp, message...) \ __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) extern void ext4_update_dynamic_rev(struct super_block *sb); @@ -1904,8 +1978,8 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg); -extern __u32 ext4_free_blks_count(struct super_block *sb, - struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); extern __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_used_dirs_count(struct super_block *sb, @@ -1918,8 +1992,9 @@ extern void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); -extern void ext4_free_blks_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); extern void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_used_dirs_set(struct super_block *sb, @@ -2028,13 +2103,13 @@ do { \ } while (0) #ifdef CONFIG_SMP -/* Each CPU can accumulate percpu_counter_batch blocks in their local - * counters. So we need to make sure we have free blocks more +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. */ -#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) #else -#define EXT4_FREEBLOCKS_WATERMARK 0 +#define EXT4_FREECLUSTERS_WATERMARK 0 #endif static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) @@ -2067,11 +2142,19 @@ struct ext4_group_info { * 5 free 8-block regions. */ }; -#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 @@ -2123,6 +2206,19 @@ static inline void ext4_mark_super_dirty(struct super_block *sb) } /* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* * Inodes and files operations */ @@ -2151,6 +2247,8 @@ extern void ext4_exit_system_zone(void); extern int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); @@ -2197,10 +2295,19 @@ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); enum ext4_state_bits { BH_Uninit /* blocks are allocated but uninitialized on disk */ = BH_JBDPrivateStart, + BH_AllocFromCluster, /* allocated blocks were part of already + * allocated cluster. Note that this flag will + * never, ever appear in a buffer_head's state + * flag. See EXT4_MAP_FROM_CLUSTER to see where + * this is used. */ + BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This + * flag is set when ext4_map_blocks is called on a + * delayed allocated block to get its real mapping. */ }; BUFFER_FNS(Uninit, uninit) TAS_BUFFER_FNS(Uninit, uninit) +BUFFER_FNS(Da_Mapped, da_mapped) /* * Add new method to test wether block and inode bitmaps are properly @@ -2230,6 +2337,12 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; +#define EXT4_RESIZING 0 +extern int ext4_resize_begin(struct super_block *sb); +extern void ext4_resize_end(struct super_block *sb); + #endif /* __KERNEL__ */ +#include "ext4_extents.h" + #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 095c36f3b612..a52db3a69a30 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -290,5 +290,7 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index f5240aa15601..aca179017582 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -109,9 +109,11 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, - bh, handle, err); + if (err) { + /* Errors can only happen if there is a bug */ + handle->h_err = err; + __ext4_journal_stop(where, line, handle); + } } else { if (inode) mark_buffer_dirty_inode(bh, inode); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index bb85757689b6..5802fa1dab18 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -289,10 +289,10 @@ static inline int ext4_should_order_data(struct inode *inode) static inline int ext4_should_writeback_data(struct inode *inode) { - if (!S_ISREG(inode->i_mode)) - return 0; if (EXT4_JOURNAL(inode) == NULL) return 1; + if (!S_ISREG(inode->i_mode)) + return 0; if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) return 0; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f815cc81e7a2..61fa9e1614af 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -42,7 +42,6 @@ #include <asm/uaccess.h> #include <linux/fiemap.h> #include "ext4_jbd2.h" -#include "ext4_extents.h" #include <trace/events/ext4.h> @@ -96,13 +95,17 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -static int ext4_ext_dirty(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) +static int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) { int err; if (path->p_bh) { /* path points to block */ - err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); + err = __ext4_handle_dirty_metadata(where, line, handle, + inode, path->p_bh); } else { /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); @@ -114,17 +117,9 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { - struct ext4_inode_info *ei = EXT4_I(inode); - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); - int depth; - if (path) { + int depth = path->p_depth; struct ext4_extent *ex; - depth = path->p_depth; /* * Try to predict block placement assuming that we are @@ -161,36 +156,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, } /* OK. use inode's group */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - /* - * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME - * block groups per flexgroup, reserve the first block - * group for directories and special files. Regular - * files will start at the second block group. This - * tends to speed up directory access and improves - * fsck times. - */ - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - /* - * If we are doing delayed allocation, we don't need take - * colour into account. - */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour + block; + return ext4_inode_to_goal_block(inode); } /* @@ -215,12 +181,10 @@ static inline int ext4_ext_space_block(struct inode *inode, int check) size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 6) - size = 6; + if (!check && size > 6) + size = 6; #endif - } return size; } @@ -230,12 +194,10 @@ static inline int ext4_ext_space_block_idx(struct inode *inode, int check) size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 5) - size = 5; + if (!check && size > 5) + size = 5; #endif - } return size; } @@ -246,12 +208,10 @@ static inline int ext4_ext_space_root(struct inode *inode, int check) size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 3) - size = 3; + if (!check && size > 3) + size = 3; #endif - } return size; } @@ -262,12 +222,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 4) - size = 4; + if (!check && size > 4) + size = 4; #endif - } return size; } @@ -279,7 +237,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { struct ext4_inode_info *ei = EXT4_I(inode); - int idxs, num = 0; + int idxs; idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx)); @@ -294,6 +252,8 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) */ if (ei->i_da_metadata_calc_len && ei->i_da_metadata_calc_last_lblock+1 == lblock) { + int num = 0; + if ((ei->i_da_metadata_calc_len % idxs) == 0) num++; if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) @@ -356,8 +316,6 @@ static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent_header *eh, int depth) { - struct ext4_extent *ext; - struct ext4_extent_idx *ext_idx; unsigned short entries; if (eh->eh_entries == 0) return 1; @@ -366,7 +324,7 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ - ext = EXT_FIRST_EXTENT(eh); + struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; @@ -374,7 +332,7 @@ static int ext4_valid_extent_entries(struct inode *inode, entries--; } } else { - ext_idx = EXT_FIRST_INDEX(eh); + struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); while (entries) { if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; @@ -776,42 +734,44 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, logical, le32_to_cpu(curp->p_idx->ei_block)); return -EIO; } - len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; + + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + >= le16_to_cpu(curp->p_hdr->eh_max))) { + EXT4_ERROR_INODE(inode, + "eh_entries %d >= eh_max %d!", + le16_to_cpu(curp->p_hdr->eh_entries), + le16_to_cpu(curp->p_hdr->eh_max)); + return -EIO; + } + if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ - if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { - len = (len - 1) * sizeof(struct ext4_extent_idx); - len = len < 0 ? 0 : len; - ext_debug("insert new index %d after: %llu. " - "move %d from 0x%p to 0x%p\n", - logical, ptr, len, - (curp->p_idx + 1), (curp->p_idx + 2)); - memmove(curp->p_idx + 2, curp->p_idx + 1, len); - } + ext_debug("insert new index %d after: %llu\n", logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ - len = len * sizeof(struct ext4_extent_idx); - len = len < 0 ? 0 : len; - ext_debug("insert new index %d before: %llu. " - "move %d from 0x%p to 0x%p\n", - logical, ptr, len, - curp->p_idx, (curp->p_idx + 1)); - memmove(curp->p_idx + 1, curp->p_idx, len); + ext_debug("insert new index %d before: %llu\n", logical, ptr); ix = curp->p_idx; } + len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; + BUG_ON(len < 0); + if (len > 0) { + ext_debug("insert new index %d: " + "move %d indices from 0x%p to 0x%p\n", + logical, len, ix, ix + 1); + memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); + } + + if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); + return -EIO; + } + ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); - if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) - > le16_to_cpu(curp->p_hdr->eh_max))) { - EXT4_ERROR_INODE(inode, - "logical %d == ei_block %d!", - logical, le32_to_cpu(curp->p_idx->ei_block)); - return -EIO; - } if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); return -EIO; @@ -1074,16 +1034,14 @@ cleanup: */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, unsigned int flags, - struct ext4_ext_path *path, struct ext4_extent *newext) { - struct ext4_ext_path *curp = path; struct ext4_extent_header *neh; struct buffer_head *bh; ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_meta_block(handle, inode, path, + newblock = ext4_ext_new_meta_block(handle, inode, NULL, newext, &err, flags); if (newblock == 0) return err; @@ -1103,7 +1061,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, } /* move top-level index/leaf into new block */ - memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); + memmove(bh->b_data, EXT4_I(inode)->i_data, + sizeof(EXT4_I(inode)->i_data)); /* set size of new block */ neh = ext_block_hdr(bh); @@ -1121,32 +1080,23 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, if (err) goto out; - /* create index in new top-level index: num,max,pointer */ - err = ext4_ext_get_access(handle, inode, curp); - if (err) - goto out; - - curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; - curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); - curp->p_hdr->eh_entries = cpu_to_le16(1); - curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); - - if (path[0].p_hdr->eh_depth) - curp->p_idx->ei_block = - EXT_FIRST_INDEX(path[0].p_hdr)->ei_block; - else - curp->p_idx->ei_block = - EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; - ext4_idx_store_pblock(curp->p_idx, newblock); - + /* Update top-level index: num,max,pointer */ neh = ext_inode_hdr(inode); + neh->eh_entries = cpu_to_le16(1); + ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); + if (neh->eh_depth == 0) { + /* Root extent block becomes index block */ + neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); + EXT_FIRST_INDEX(neh)->ei_block = + EXT_FIRST_EXTENT(neh)->ee_block; + } ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); - neh->eh_depth = cpu_to_le16(path->p_depth + 1); - err = ext4_ext_dirty(handle, inode, curp); + neh->eh_depth = cpu_to_le16(neh->eh_depth + 1); + ext4_mark_inode_dirty(handle, inode); out: brelse(bh); @@ -1194,8 +1144,7 @@ repeat: err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, flags, - path, newext); + err = ext4_ext_grow_indepth(handle, inode, flags, newext); if (err) goto out; @@ -1267,9 +1216,9 @@ static int ext4_ext_search_left(struct inode *inode, if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", - ix != NULL ? ix->ei_block : 0, + ix != NULL ? le32_to_cpu(ix->ei_block) : 0, EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? - EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, + le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, depth); return -EIO; } @@ -1292,13 +1241,14 @@ static int ext4_ext_search_left(struct inode *inode, /* * search the closest allocated block to the right for *logical * and returns it at @logical + it's physical address at @phys - * if *logical is the smallest allocated block, the function + * if *logical is the largest allocated block, the function * returns 0 at @phys * return value contains 0 (success) or error code */ static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t *logical, ext4_fsblk_t *phys) + ext4_lblk_t *logical, ext4_fsblk_t *phys, + struct ext4_extent **ret_ex) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1340,9 +1290,7 @@ static int ext4_ext_search_right(struct inode *inode, return -EIO; } } - *logical = le32_to_cpu(ex->ee_block); - *phys = ext4_ext_pblock(ex); - return 0; + goto found_extent; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { @@ -1355,9 +1303,7 @@ static int ext4_ext_search_right(struct inode *inode, if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { /* next allocated block in this leaf */ ex++; - *logical = le32_to_cpu(ex->ee_block); - *phys = ext4_ext_pblock(ex); - return 0; + goto found_extent; } /* go up and search for index to the right */ @@ -1400,9 +1346,12 @@ got_index: return -EIO; } ex = EXT_FIRST_EXTENT(eh); +found_extent: *logical = le32_to_cpu(ex->ee_block); *phys = ext4_ext_pblock(ex); - put_bh(bh); + *ret_ex = ex; + if (bh) + put_bh(bh); return 0; } @@ -1427,7 +1376,8 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) while (depth >= 0) { if (depth == path->p_depth) { /* leaf */ - if (path[depth].p_ext != + if (path[depth].p_ext && + path[depth].p_ext != EXT_LAST_EXTENT(path[depth].p_hdr)) return le32_to_cpu(path[depth].p_ext[1].ee_block); } else { @@ -1446,8 +1396,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) * ext4_ext_next_leaf_block: * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ -static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, - struct ext4_ext_path *path) +static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) { int depth; @@ -1656,7 +1605,8 @@ static int ext4_ext_try_to_merge(struct inode *inode, * such that there will be no overlap, and then returns 1. * If there is no overlap found, it returns 0. */ -static unsigned int ext4_ext_check_overlap(struct inode *inode, +static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, + struct inode *inode, struct ext4_extent *newext, struct ext4_ext_path *path) { @@ -1670,6 +1620,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode, if (!path[depth].p_ext) goto out; b2 = le32_to_cpu(path[depth].p_ext->ee_block); + b2 &= ~(sbi->s_cluster_ratio - 1); /* * get the next allocated block if the extent in the path @@ -1679,6 +1630,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode, b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; + b2 &= ~(sbi->s_cluster_ratio - 1); } /* check for wrap through zero on extent logical start block*/ @@ -1730,7 +1682,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, /* try to insert block into found extent and return */ if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", + ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n", ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), @@ -1757,7 +1709,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, goto merge; } -repeat: depth = ext_depth(inode); eh = path[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) @@ -1765,10 +1716,11 @@ repeat: /* probably next leaf has space for us? */ fex = EXT_LAST_EXTENT(eh); - next = ext4_ext_next_leaf_block(inode, path); - if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) - && next != EXT_MAX_BLOCKS) { - ext_debug("next leaf block - %d\n", next); + next = EXT_MAX_BLOCKS; + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) + next = ext4_ext_next_leaf_block(path); + if (next != EXT_MAX_BLOCKS) { + ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); npath = ext4_ext_find_extent(inode, next, NULL); if (IS_ERR(npath)) @@ -1779,7 +1731,7 @@ repeat: ext_debug("next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); path = npath; - goto repeat; + goto has_space; } ext_debug("next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -1806,46 +1758,51 @@ has_space: if (!nearex) { /* there is no extent in this leaf, create first one */ - ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", + ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext)); - path[depth].p_ext = EXT_FIRST_EXTENT(eh); - } else if (le32_to_cpu(newext->ee_block) + nearex = EXT_FIRST_EXTENT(eh); + } else { + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { -/* BUG_ON(newext->ee_block == nearex->ee_block); */ - if (nearex != EXT_LAST_EXTENT(eh)) { - len = EXT_MAX_EXTENT(eh) - nearex; - len = (len - 1) * sizeof(struct ext4_extent); - len = len < 0 ? 0 : len; - ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " - "move %d from 0x%p to 0x%p\n", + /* Insert after */ + ext_debug("insert %u:%llu:[%d]%d before: " + "nearest %p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + nearex); + nearex++; + } else { + /* Insert before */ + BUG_ON(newext->ee_block == nearex->ee_block); + ext_debug("insert %u:%llu:[%d]%d after: " + "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), - nearex, len, nearex + 1, nearex + 2); - memmove(nearex + 2, nearex + 1, len); + nearex); + } + len = EXT_LAST_EXTENT(eh) - nearex + 1; + if (len > 0) { + ext_debug("insert %u:%llu:[%d]%d: " + "move %d extents from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + len, nearex, nearex + 1); + memmove(nearex + 1, nearex, + len * sizeof(struct ext4_extent)); } - path[depth].p_ext = nearex + 1; - } else { - BUG_ON(newext->ee_block == nearex->ee_block); - len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); - len = len < 0 ? 0 : len; - ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " - "move %d from 0x%p to 0x%p\n", - le32_to_cpu(newext->ee_block), - ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - nearex, len, nearex + 1, nearex + 2); - memmove(nearex + 1, nearex, len); - path[depth].p_ext = nearex; } le16_add_cpu(&eh->eh_entries, 1); - nearex = path[depth].p_ext; + path[depth].p_ext = nearex; nearex->ee_block = newext->ee_block; ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); nearex->ee_len = newext->ee_len; @@ -1995,6 +1952,7 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_ext_cache *cex; BUG_ON(len == 0); spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + trace_ext4_ext_put_in_cache(inode, block, len, start); cex = &EXT4_I(inode)->i_cached_extent; cex->ec_block = block; cex->ec_len = len; @@ -2052,7 +2010,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } /* - * ext4_ext_in_cache() + * ext4_ext_check_cache() * Checks to see if the given block is in the cache. * If it is, the cached extent is stored in the given * cache extent pointer. If the cached extent is a hole, @@ -2096,6 +2054,7 @@ errout: sbi->extent_cache_misses++; else sbi->extent_cache_hits++; + trace_ext4_ext_in_cache(inode, block, ret); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return ret; } @@ -2134,8 +2093,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, /* * ext4_ext_rm_idx: * removes index from the index block. - * It's used in truncate case only, thus all requests are for - * last index in the block only. */ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) @@ -2153,11 +2110,20 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path); if (err) return err; + + if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { + int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; + len *= sizeof(struct ext4_extent_idx); + memmove(path->p_idx, path->p_idx + 1, len); + } + le16_add_cpu(&path->p_hdr->eh_entries, -1); err = ext4_ext_dirty(handle, inode, path); if (err) return err; ext_debug("index is empty, remove it, free block %llu\n", leaf); + trace_ext4_ext_rm_idx(inode, leaf); + ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return err; @@ -2186,7 +2152,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, * need to account for leaf block credit * * bitmaps and block group descriptor blocks - * and other metadat blocks still need to be + * and other metadata blocks still need to be * accounted. */ /* 1 bitmap, 1 block group descriptor */ @@ -2223,14 +2189,40 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, - struct ext4_extent *ex, - ext4_lblk_t from, ext4_lblk_t to) + struct ext4_extent *ex, + ext4_fsblk_t *partial_cluster, + ext4_lblk_t from, ext4_lblk_t to) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); + ext4_fsblk_t pblk; int flags = EXT4_FREE_BLOCKS_FORGET; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) flags |= EXT4_FREE_BLOCKS_METADATA; + /* + * For bigalloc file systems, we never free a partial cluster + * at the beginning of the extent. Instead, we make a note + * that we tried freeing the cluster, and check to see if we + * need to free it on a subsequent call to ext4_remove_blocks, + * or at the end of the ext4_truncate() operation. + */ + flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + + trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); + /* + * If we have a partial cluster, and it's different from the + * cluster of the last block, we need to explicitly free the + * partial cluster here. + */ + pblk = ext4_ext_pblock(ex) + ee_len - 1; + if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, flags); + *partial_cluster = 0; + } + #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2250,12 +2242,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ ext4_lblk_t num; - ext4_fsblk_t start; num = le32_to_cpu(ex->ee_block) + ee_len - from; - start = ext4_ext_pblock(ex) + ee_len - num; - ext_debug("free last %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, NULL, start, num, flags); + pblk = ext4_ext_pblock(ex) + ee_len - num; + ext_debug("free last %u blocks starting %llu\n", num, pblk); + ext4_free_blocks(handle, inode, NULL, pblk, num, flags); + /* + * If the block range to be freed didn't start at the + * beginning of a cluster, and we removed the entire + * extent, save the partial cluster here, since we + * might need to delete if we determine that the + * truncate operation has removed all of the blocks in + * the cluster. + */ + if (pblk & (sbi->s_cluster_ratio - 1) && + (ee_len == num)) + *partial_cluster = EXT4_B2C(sbi, pblk); + else + *partial_cluster = 0; } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { /* head removal */ @@ -2266,7 +2270,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, start = ext4_ext_pblock(ex); ext_debug("free first %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, 0, start, num, flags); + ext4_free_blocks(handle, inode, NULL, start, num, flags); } else { printk(KERN_INFO "strange request: removal(2) " @@ -2290,19 +2294,19 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_lblk_t start, - ext4_lblk_t end) + struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, + ext4_lblk_t start, ext4_lblk_t end) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; int depth = ext_depth(inode), credits; struct ext4_extent_header *eh; - ext4_lblk_t a, b, block; + ext4_lblk_t a, b; unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; unsigned uninitialized = 0; struct ext4_extent *ex; - struct ext4_map_blocks map; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug("truncate since %u in leaf\n", start); @@ -2319,6 +2323,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); + trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); + while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { @@ -2343,86 +2349,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; - } else if (a != ex_ee_block && - b != ex_ee_block + ex_ee_len - 1) { - /* - * If this is a truncate, then this condition should - * never happen because at least one of the end points - * needs to be on the edge of the extent. - */ - if (end == EXT_MAX_BLOCKS - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - block = 0; - num = 0; - err = -EIO; - goto out; - } - /* - * else this is a hole punch, so the extent needs to - * be split since neither edge of the hole is on the - * extent edge - */ - else{ - map.m_pblk = ext4_ext_pblock(ex); - map.m_lblk = ex_ee_block; - map.m_len = b - ex_ee_block; - - err = ext4_split_extent(handle, - inode, path, &map, 0, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT | - EXT4_GET_BLOCKS_PRE_IO); - - if (err < 0) - goto out; - - ex_ee_len = ext4_ext_get_actual_len(ex); - - b = ex_ee_block+ex_ee_len - 1 < end ? - ex_ee_block+ex_ee_len - 1 : end; - - /* Then remove tail of this extent */ - block = ex_ee_block; - num = a - block; - } + } else if (b != ex_ee_block + ex_ee_len - 1) { + EXT4_ERROR_INODE(inode," bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; } else if (a != ex_ee_block) { /* remove tail of the extent */ - block = ex_ee_block; - num = a - block; - } else if (b != ex_ee_block + ex_ee_len - 1) { - /* remove head of the extent */ - block = b; - num = ex_ee_block + ex_ee_len - b; - - /* - * If this is a truncate, this condition - * should never happen - */ - if (end == EXT_MAX_BLOCKS - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } + num = a - ex_ee_block; } else { /* remove whole extent: excellent! */ - block = ex_ee_block; num = 0; - if (a != ex_ee_block) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } - - if (b != ex_ee_block + ex_ee_len - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } } - /* * 3 for leaf, sb, and inode plus 2 (bmap and group * descriptor) for each block group; assume two block @@ -2444,23 +2382,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - err = ext4_remove_blocks(handle, inode, ex, a, b); + err = ext4_remove_blocks(handle, inode, ex, partial_cluster, + a, b); if (err) goto out; - if (num == 0) { + if (num == 0) /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); - } else if (block != ex_ee_block) { - /* - * If this was a head removal, then we need to update - * the physical block since it is now at a different - * location - */ - ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a)); - } - ex->ee_block = cpu_to_le32(block); ex->ee_len = cpu_to_le16(num); /* * Do not mark uninitialized if all the blocks in the @@ -2468,11 +2398,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, */ if (uninitialized && num) ext4_ext_mark_uninitialized(ex); - - err = ext4_ext_dirty(handle, inode, path + depth); - if (err) - goto out; - /* * If the extent was completely released, * we need to remove it from the leaf @@ -2492,9 +2417,14 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); - } + } else + *partial_cluster = 0; - ext_debug("new extent: %u:%u:%llu\n", block, num, + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); @@ -2504,6 +2434,25 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (correct_index && eh->eh_entries) err = ext4_ext_correct_indexes(handle, inode, path); + /* + * If there is still a entry in the leaf node, check to see if + * it references the partial cluster. This is the only place + * where it could; if it doesn't, we can free the cluster. + */ + if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && + (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != + *partial_cluster)) { + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, flags); + *partial_cluster = 0; + } + /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) @@ -2534,12 +2483,12 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); struct ext4_ext_path *path; + ext4_fsblk_t partial_cluster = 0; handle_t *handle; int i, err; @@ -2553,6 +2502,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, again: ext4_ext_invalidate_cache(inode); + trace_ext4_ext_remove_space(inode, start, depth); + /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. @@ -2575,7 +2526,8 @@ again: if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, - start, end); + &partial_cluster, start, + EXT_MAX_BLOCKS - 1); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -2647,6 +2599,24 @@ again: } } + trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, + path->p_hdr->eh_entries); + + /* If we still have something in the partial cluster and we have removed + * even the first extent, then we should free the blocks in the partial + * cluster as well. */ + if (partial_cluster && path->p_hdr->eh_entries == 0) { + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(EXT4_SB(sb), partial_cluster), + EXT4_SB(sb)->s_cluster_ratio, flags); + partial_cluster = 0; + } + /* TODO: flexible tree reduction should be here */ if (path->p_hdr->eh_entries == 0) { /* @@ -2938,17 +2908,29 @@ out: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent + * + * Pre-conditions: + * - The extent pointed to by 'path' is uninitialized. + * - The extent pointed to by 'path' contains a superset + * of the logical span [map->m_lblk, map->m_lblk + map->m_len). + * + * Post-conditions on success: + * - the returned value is the number of blocks beyond map->l_lblk + * that are allocated and initialized. + * It is guaranteed to be >= map->m_len. */ static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path) { + struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex; struct ext4_extent *ex; ext4_lblk_t ee_block, eof_block; - unsigned int allocated, ee_len, depth; + unsigned int ee_len, depth; + int allocated; int err = 0; int split_flag = 0; @@ -2962,11 +2944,93 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); + eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); + trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); + + /* Pre-conditions */ + BUG_ON(!ext4_ext_is_uninitialized(ex)); + BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); + BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len); + + /* + * Attempt to transfer newly initialized blocks from the currently + * uninitialized extent to its left neighbor. This is much cheaper + * than an insertion followed by a merge as those involve costly + * memmove() calls. This is the common case in steady state for + * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append + * writes. + * + * Limitations of the current logic: + * - L1: we only deal with writes at the start of the extent. + * The approach could be extended to writes at the end + * of the extent but this scenario was deemed less common. + * - L2: we do not deal with writes covering the whole extent. + * This would require removing the extent if the transfer + * is possible. + * - L3: we only attempt to merge with an extent stored in the + * same extent tree node. + */ + if ((map->m_lblk == ee_block) && /*L1*/ + (map->m_len < ee_len) && /*L2*/ + (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/ + struct ext4_extent *prev_ex; + ext4_lblk_t prev_lblk; + ext4_fsblk_t prev_pblk, ee_pblk; + unsigned int prev_len, write_len; + + prev_ex = ex - 1; + prev_lblk = le32_to_cpu(prev_ex->ee_block); + prev_len = ext4_ext_get_actual_len(prev_ex); + prev_pblk = ext4_ext_pblock(prev_ex); + ee_pblk = ext4_ext_pblock(ex); + write_len = map->m_len; + + /* + * A transfer of blocks from 'ex' to 'prev_ex' is allowed + * upon those conditions: + * - C1: prev_ex is initialized, + * - C2: prev_ex is logically abutting ex, + * - C3: prev_ex is physically abutting ex, + * - C4: prev_ex can receive the additional blocks without + * overflowing the (initialized) length limit. + */ + if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/ + ((prev_lblk + prev_len) == ee_block) && /*C2*/ + ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ + (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, prev_ex); + + /* Shift the start of ex by 'write_len' blocks */ + ex->ee_block = cpu_to_le32(ee_block + write_len); + ext4_ext_store_pblock(ex, ee_pblk + write_len); + ex->ee_len = cpu_to_le16(ee_len - write_len); + ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + + /* Extend prev_ex by 'write_len' blocks */ + prev_ex->ee_len = cpu_to_le16(prev_len + write_len); + + /* Mark the block containing both extents as dirty */ + ext4_ext_dirty(handle, inode, path + depth); + + /* Update path to point to the right extent */ + path[depth].p_ext = prev_ex; + + /* Result: number of initialized blocks past m_lblk */ + allocated = write_len; + goto out; + } + } + WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit @@ -3107,12 +3171,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct ext4_ext_path *path) { struct ext4_extent *ex; - struct ext4_extent_header *eh; int depth; int err = 0; depth = ext_depth(inode); - eh = path[depth].p_hdr; ex = path[depth].p_ext; ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" @@ -3196,6 +3258,192 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, return ext4_mark_inode_dirty(handle, inode); } +/** + * ext4_find_delalloc_range: find delayed allocated block in the given range. + * + * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns + * whether there are any buffers marked for delayed allocation. It returns '1' + * on the first delalloc'ed buffer head found. If no buffer head in the given + * range is marked for delalloc, it returns 0. + * lblk_start should always be <= lblk_end. + * search_hint_reverse is to indicate that searching in reverse from lblk_end to + * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed + * block sooner). This is useful when blocks are truncated sequentially from + * lblk_start towards lblk_end. + */ +static int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end, + int search_hint_reverse) +{ + struct address_space *mapping = inode->i_mapping; + struct buffer_head *head, *bh = NULL; + struct page *page; + ext4_lblk_t i, pg_lblk; + pgoff_t index; + + /* reverse search wont work if fs block size is less than page size */ + if (inode->i_blkbits < PAGE_CACHE_SHIFT) + search_hint_reverse = 0; + + if (search_hint_reverse) + i = lblk_end; + else + i = lblk_start; + + index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + + while ((i >= lblk_start) && (i <= lblk_end)) { + page = find_get_page(mapping, index); + if (!page) + goto nextpage; + + if (!page_has_buffers(page)) + goto nextpage; + + head = page_buffers(page); + if (!head) + goto nextpage; + + bh = head; + pg_lblk = index << (PAGE_CACHE_SHIFT - + inode->i_blkbits); + do { + if (unlikely(pg_lblk < lblk_start)) { + /* + * This is possible when fs block size is less + * than page size and our cluster starts/ends in + * middle of the page. So we need to skip the + * initial few blocks till we reach the 'lblk' + */ + pg_lblk++; + continue; + } + + /* Check if the buffer is delayed allocated and that it + * is not yet mapped. (when da-buffers are mapped during + * their writeout, their da_mapped bit is set.) + */ + if (buffer_delay(bh) && !buffer_da_mapped(bh)) { + page_cache_release(page); + trace_ext4_find_delalloc_range(inode, + lblk_start, lblk_end, + search_hint_reverse, + 1, i); + return 1; + } + if (search_hint_reverse) + i--; + else + i++; + } while ((i >= lblk_start) && (i <= lblk_end) && + ((bh = bh->b_this_page) != head)); +nextpage: + if (page) + page_cache_release(page); + /* + * Move to next page. 'i' will be the first lblk in the next + * page. + */ + if (search_hint_reverse) + index--; + else + index++; + i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + } + + trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse, 0, 0); + return 0; +} + +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t lblk_start, lblk_end; + lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); + lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + + return ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse); +} + +/** + * Determines how many complete clusters (out of those specified by the 'map') + * are under delalloc and were reserved quota for. + * This function is called when we are writing out the blocks that were + * originally written with their allocation delayed, but then the space was + * allocated using fallocate() before the delayed allocation could be resolved. + * The cases to look for are: + * ('=' indicated delayed allocated blocks + * '-' indicates non-delayed allocated blocks) + * (a) partial clusters towards beginning and/or end outside of allocated range + * are not delalloc'ed. + * Ex: + * |----c---=|====c====|====c====|===-c----| + * |++++++ allocated ++++++| + * ==> 4 complete clusters in above example + * + * (b) partial cluster (outside of allocated range) towards either end is + * marked for delayed allocation. In this case, we will exclude that + * cluster. + * Ex: + * |----====c========|========c========| + * |++++++ allocated ++++++| + * ==> 1 complete clusters in above example + * + * Ex: + * |================c================| + * |++++++ allocated ++++++| + * ==> 0 complete clusters in above example + * + * The ext4_da_update_reserve_space will be called only if we + * determine here that there were some "entire" clusters that span + * this 'allocated' range. + * In the non-bigalloc case, this function will just end up returning num_blks + * without ever calling ext4_find_delalloc_range. + */ +static unsigned int +get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, + unsigned int num_blks) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t alloc_cluster_start, alloc_cluster_end; + ext4_lblk_t lblk_from, lblk_to, c_offset; + unsigned int allocated_clusters = 0; + + alloc_cluster_start = EXT4_B2C(sbi, lblk_start); + alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); + + /* max possible clusters for this allocation */ + allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; + + trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); + + /* Check towards left side */ + c_offset = lblk_start & (sbi->s_cluster_ratio - 1); + if (c_offset) { + lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); + lblk_to = lblk_from + c_offset - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + allocated_clusters--; + } + + /* Now check towards right. */ + c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); + if (allocated_clusters && c_offset) { + lblk_from = lblk_start + num_blks; + lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + allocated_clusters--; + } + + return allocated_clusters; +} + static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -3212,6 +3460,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); + trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, + newblock); + /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { ret = ext4_split_unwritten_extents(handle, inode, map, @@ -3221,10 +3472,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * that this IO needs to conversion to written when IO is * completed */ - if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } else + if (io) + ext4_set_io_unwritten_flag(inode, io); + else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) map->m_flags |= EXT4_MAP_UNINIT; @@ -3265,14 +3515,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, /* buffered write, writepage time, convert*/ ret = ext4_ext_convert_to_initialized(handle, inode, map, path); - if (ret >= 0) { + if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map->m_lblk, path, - map->m_len); - if (err < 0) - goto out2; - } - out: if (ret <= 0) { err = ret; @@ -3301,11 +3545,24 @@ out: * But fallocate would have already updated quota and block * count for this offset. So cancel these reservation */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_da_update_reserve_space(inode, allocated, 0); + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, map->m_len); + if (reserved_clusters) + ext4_da_update_reserve_space(inode, + reserved_clusters, + 0); + } map_out: map->m_flags |= EXT4_MAP_MAPPED; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, + map->m_len); + if (err < 0) + goto out2; + } out1: if (allocated > map->m_len) allocated = map->m_len; @@ -3321,6 +3578,111 @@ out2: } /* + * get_implied_cluster_alloc - check to see if the requested + * allocation (in the map structure) overlaps with a cluster already + * allocated in an extent. + * @sb The filesystem superblock structure + * @map The requested lblk->pblk mapping + * @ex The extent structure which might contain an implied + * cluster allocation + * + * This function is called by ext4_ext_map_blocks() after we failed to + * find blocks that were already in the inode's extent tree. Hence, + * we know that the beginning of the requested region cannot overlap + * the extent from the inode's extent tree. There are three cases we + * want to catch. The first is this case: + * + * |--- cluster # N--| + * |--- extent ---| |---- requested region ---| + * |==========| + * + * The second case that we need to test for is this one: + * + * |--------- cluster # N ----------------| + * |--- requested region --| |------- extent ----| + * |=======================| + * + * The third case is when the requested region lies between two extents + * within the same cluster: + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + * + * In each of the above cases, we need to set the map->m_pblk and + * map->m_len so it corresponds to the return the extent labelled as + * "|====|" from cluster #N, since it is already in use for data in + * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to + * signal to ext4_ext_map_blocks() that map->m_pblk should be treated + * as a new "allocated" block region. Otherwise, we will return 0 and + * ext4_ext_map_blocks() will then allocate one or more new clusters + * by calling ext4_mb_new_blocks(). + */ +static int get_implied_cluster_alloc(struct super_block *sb, + struct ext4_map_blocks *map, + struct ext4_extent *ex, + struct ext4_ext_path *path) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + ext4_lblk_t ex_cluster_start, ex_cluster_end; + ext4_lblk_t rr_cluster_start, rr_cluster_end; + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); + unsigned short ee_len = ext4_ext_get_actual_len(ex); + + /* The extent passed in that we are trying to match */ + ex_cluster_start = EXT4_B2C(sbi, ee_block); + ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); + + /* The requested region passed into ext4_map_blocks() */ + rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); + rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1); + + if ((rr_cluster_start == ex_cluster_end) || + (rr_cluster_start == ex_cluster_start)) { + if (rr_cluster_start == ex_cluster_end) + ee_start += ee_len - 1; + map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + + c_offset; + map->m_len = min(map->m_len, + (unsigned) sbi->s_cluster_ratio - c_offset); + /* + * Check for and handle this case: + * + * |--------- cluster # N-------------| + * |------- extent ----| + * |--- requested region ---| + * |===========| + */ + + if (map->m_lblk < ee_block) + map->m_len = min(map->m_len, ee_block - map->m_lblk); + + /* + * Check for the case where there is already another allocated + * block to the right of 'ex' but before the end of the cluster. + * + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + */ + if (map->m_lblk > ee_block) { + ext4_lblk_t next = ext4_ext_next_allocated_block(path); + map->m_len = min(map->m_len, next - map->m_lblk); + } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); + return 1; + } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); + return 0; +} + + +/* * Block allocation/map/preallocation routine for extents based files * * @@ -3342,24 +3704,30 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; - struct ext4_extent newex, *ex; + struct ext4_extent newex, *ex, *ex2; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int err = 0, depth, ret; - unsigned int allocated = 0; + int free_on_err = 0, err = 0, depth, ret; + unsigned int allocated = 0, offset = 0; + unsigned int allocated_clusters = 0; unsigned int punched_out = 0; unsigned int result = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; - struct ext4_map_blocks punch_map; + ext4_lblk_t cluster_offset; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ - if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && - ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { + if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && + ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { + if ((sbi->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * block isn't allocated yet and @@ -3370,6 +3738,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* we should allocate requested block */ } else { /* block is already allocated */ + if (sbi->s_cluster_ratio > 1) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; newblock = map->m_lblk - le32_to_cpu(newex.ee_block) + ext4_ext_pblock(&newex); @@ -3415,8 +3785,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); + + trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); + /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { + struct ext4_map_blocks punch_map; + ext4_fsblk_t partial_cluster = 0; + newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); @@ -3497,13 +3873,37 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_ext_mark_uninitialized(ex); - err = ext4_ext_remove_space(inode, map->m_lblk, - map->m_lblk + punched_out); + ext4_ext_invalidate_cache(inode); + + err = ext4_ext_rm_leaf(handle, inode, path, + &partial_cluster, map->m_lblk, + map->m_lblk + punched_out); + + if (!err && path->p_hdr->eh_entries == 0) { + /* + * Punch hole freed all of this sub tree, + * so we need to correct eh_depth + */ + err = ext4_ext_get_access(handle, inode, path); + if (err == 0) { + ext_inode_hdr(inode)->eh_depth = 0; + ext_inode_hdr(inode)->eh_max = + cpu_to_le16(ext4_ext_space_root( + inode, 0)); + + err = ext4_ext_dirty( + handle, inode, path); + } + } goto out2; } } + if ((sbi->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + /* * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero @@ -3516,9 +3916,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } + /* * Okay, we need to do block allocation. */ + map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; + newex.ee_block = cpu_to_le32(map->m_lblk); + cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + + /* + * If we are doing bigalloc, check to see if the extent returned + * by ext4_ext_find_extent() implies a cluster we can use. + */ + if (cluster_offset && ex && + get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + goto got_allocated_blocks; + } /* find neighbour allocated blocks */ ar.lleft = map->m_lblk; @@ -3526,10 +3942,21 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (err) goto out2; ar.lright = map->m_lblk; - err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); + ex2 = NULL; + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); if (err) goto out2; + /* Check if the extent after searching to the right implies a + * cluster we can use. */ + if ((sbi->s_cluster_ratio > 1) && ex2 && + get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + goto got_allocated_blocks; + } + /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is @@ -3544,9 +3971,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_len = EXT_UNINIT_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ - newex.ee_block = cpu_to_le32(map->m_lblk); newex.ee_len = cpu_to_le16(map->m_len); - err = ext4_ext_check_overlap(inode, &newex, path); + err = ext4_ext_check_overlap(sbi, inode, &newex, path); if (err) allocated = ext4_ext_get_actual_len(&newex); else @@ -3556,7 +3982,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.inode = inode; ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); ar.logical = map->m_lblk; - ar.len = allocated; + /* + * We calculate the offset from the beginning of the cluster + * for the logical block number, since when we allocate a + * physical cluster, the physical block should start at the + * same offset from the beginning of the cluster. This is + * needed so that future calls to get_implied_cluster_alloc() + * work correctly. + */ + offset = map->m_lblk & (sbi->s_cluster_ratio - 1); + ar.len = EXT4_NUM_B2C(sbi, offset+allocated); + ar.goal -= offset; + ar.logical -= offset; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; else @@ -3569,9 +4006,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out2; ext_debug("allocate new block: goal %llu, found %llu/%u\n", ar.goal, newblock, allocated); + free_on_err = 1; + allocated_clusters = ar.len; + ar.len = EXT4_C2B(sbi, ar.len) - offset; + if (ar.len > allocated) + ar.len = allocated; +got_allocated_blocks: /* try to insert new extent into found leaf and return */ - ext4_ext_store_pblock(&newex, newblock); + ext4_ext_store_pblock(&newex, newblock + offset); newex.ee_len = cpu_to_le16(ar.len); /* Mark uninitialized */ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ @@ -3584,10 +4027,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * that we need to perform conversion when IO is done. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } else + if (io) + ext4_set_io_unwritten_flag(inode, io); + else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } @@ -3595,18 +4037,22 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_flags |= EXT4_MAP_UNINIT; } - err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); - if (err) - goto out2; - - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); - if (err) { + err = 0; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, ar.len); + if (!err) + err = ext4_ext_insert_extent(handle, inode, path, + &newex, flags); + if (err && free_on_err) { + int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? + EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; /* free data blocks we just allocated */ /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), 0); + ext4_ext_get_actual_len(&newex), fb_flags); goto out2; } @@ -3621,8 +4067,82 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * Update reserved blocks/metadata blocks after successful * block allocation which had been deferred till now. */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_da_update_reserve_space(inode, allocated, 1); + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; + /* + * Check how many clusters we had reserved this allocated range + */ + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, allocated); + if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { + if (reserved_clusters) { + /* + * We have clusters reserved for this range. + * But since we are not doing actual allocation + * and are simply using blocks from previously + * allocated cluster, we should release the + * reservation and not claim quota. + */ + ext4_da_update_reserve_space(inode, + reserved_clusters, 0); + } + } else { + BUG_ON(allocated_clusters < reserved_clusters); + /* We will claim quota for all newly allocated blocks.*/ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); + if (reserved_clusters < allocated_clusters) { + struct ext4_inode_info *ei = EXT4_I(inode); + int reservation = allocated_clusters - + reserved_clusters; + /* + * It seems we claimed few clusters outside of + * the range of this allocation. We should give + * it back to the reservation pool. This can + * happen in the following case: + * + * * Suppose s_cluster_ratio is 4 (i.e., each + * cluster has 4 blocks. Thus, the clusters + * are [0-3],[4-7],[8-11]... + * * First comes delayed allocation write for + * logical blocks 10 & 11. Since there were no + * previous delayed allocated blocks in the + * range [8-11], we would reserve 1 cluster + * for this write. + * * Next comes write for logical blocks 3 to 8. + * In this case, we will reserve 2 clusters + * (for [0-3] and [4-7]; and not for [8-11] as + * that range has a delayed allocated blocks. + * Thus total reserved clusters now becomes 3. + * * Now, during the delayed allocation writeout + * time, we will first write blocks [3-8] and + * allocate 3 clusters for writing these + * blocks. Also, we would claim all these + * three clusters above. + * * Now when we come here to writeout the + * blocks [10-11], we would expect to claim + * the reservation of 1 cluster we had made + * (and we would claim it since there are no + * more delayed allocated blocks in the range + * [8-11]. But our reserved cluster count had + * already gone to 0. + * + * Thus, at the step 4 above when we determine + * that there are still some unwritten delayed + * allocated blocks outside of our current + * block range, we should increment the + * reserved clusters count so that when the + * remaining blocks finally gets written, we + * could claim them. + */ + dquot_reserve_block(inode, + EXT4_C2B(sbi, reservation)); + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks += reservation; + spin_unlock(&ei->i_block_reservation_lock); + } + } + } /* * Cache the extent and update transaction to commit on fdatasync only @@ -3645,12 +4165,12 @@ out2: ext4_ext_drop_refs(path); kfree(path); } - trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, - newblock, map->m_len, err ? err : allocated); - result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? punched_out : allocated; + trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, + newblock, map->m_len, err ? err : result); + return err ? err : result; } @@ -3660,6 +4180,7 @@ void ext4_ext_truncate(struct inode *inode) struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; handle_t *handle; + loff_t page_len; int err = 0; /* @@ -3676,8 +4197,16 @@ void ext4_ext_truncate(struct inode *inode) if (IS_ERR(handle)) return; - if (inode->i_size & (sb->s_blocksize - 1)) - ext4_block_truncate_page(handle, mapping, inode->i_size); + if (inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) + goto out_stop; + } if (ext4_orphan_add(handle, inode)) goto out_stop; @@ -3699,7 +4228,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + err = ext4_ext_remove_space(inode, last_block); /* In a multi-transaction truncate, we only make the final * transaction synchronous. @@ -3771,6 +4300,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) int ret = 0; int ret2 = 0; int retries = 0; + int flags; struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; @@ -3807,6 +4337,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. + */ + if (len <= EXT_UNINIT_MAX_LEN << blkbits) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; @@ -3816,9 +4356,7 @@ retry: ret = PTR_ERR(handle); break; } - ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | - EXT4_GET_BLOCKS_NO_NORMALIZE); + ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); @@ -3835,7 +4373,7 @@ retry: blkbits) >> blkbits)) new_size = offset + len; else - new_size = (map.m_lblk + ret) << blkbits; + new_size = ((loff_t) map.m_lblk + ret) << blkbits; ext4_falloc_update_inode(inode, mode, new_size, (map.m_flags & EXT4_MAP_NEW)); @@ -4113,7 +4651,6 @@ found_delayed_extent: return EXT_BREAK; return EXT_CONTINUE; } - /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -4173,17 +4710,28 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) struct address_space *mapping = inode->i_mapping; struct ext4_map_blocks map; handle_t *handle; - loff_t first_block_offset, last_block_offset, block_len; - loff_t first_page, last_page, first_page_offset, last_page_offset; + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; int ret, credits, blocks_released, err = 0; + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + return 0; + + /* + * If the hole extends beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; + } + first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb); - last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb); - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; last_page = (offset + length) >> PAGE_CACHE_SHIFT; @@ -4196,11 +4744,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) */ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { err = filemap_write_and_wait_range(mapping, - first_page_offset == 0 ? 0 : first_page_offset-1, - last_page_offset); + offset, offset + length - 1); - if (err) - return err; + if (err) + return err; } /* Now release the pages */ @@ -4222,24 +4769,64 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) goto out; /* - * Now we need to zero out the un block aligned data. - * If the file is smaller than a block, just - * zero out the middle + * Now we need to zero out the non-page-aligned data in the + * pages at the start and tail of the hole, and unmap the buffer + * heads for the block aligned regions of the page that were + * completely zeroed. + */ + if (first_page > last_page) { + /* + * If the file space being truncated is contained within a page + * just zero out and unmap the middle of that page + */ + err = ext4_discard_partial_page_buffers(handle, + mapping, offset, length, 0); + + if (err) + goto out; + } else { + /* + * zero out and unmap the partial page that contains + * the start of the hole + */ + page_len = first_page_offset - offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + offset, page_len, 0); + if (err) + goto out; + } + + /* + * zero out and unmap the partial page that contains + * the end of the hole + */ + page_len = offset + length - last_page_offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + last_page_offset, page_len, 0); + if (err) + goto out; + } + } + + + /* + * If i_size is contained in the last page, we need to + * unmap and zero the partial page after i_size */ - if (first_block > last_block) - ext4_block_zero_page_range(handle, mapping, offset, length); - else { - /* zero out the head of the hole before the first block */ - block_len = first_block_offset - offset; - if (block_len > 0) - ext4_block_zero_page_range(handle, mapping, - offset, block_len); - - /* zero out the tail of the hole after the last block */ - block_len = offset + length - last_block_offset; - if (block_len > 0) { - ext4_block_zero_page_range(handle, mapping, - last_block_offset, block_len); + if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && + inode->i_size % PAGE_CACHE_SIZE != 0) { + + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) + goto out; } } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ce766f974b1d..cb70f1812a70 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -181,8 +181,8 @@ static int ext4_file_open(struct inode * inode, struct file * filp) path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); if (!IS_ERR(cp)) { - memcpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); + strlcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); ext4_mark_super_dirty(sb); } } @@ -224,53 +224,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; else maxbytes = inode->i_sb->s_maxbytes; - mutex_lock(&inode->i_mutex); - switch (origin) { - case SEEK_END: - offset += inode->i_size; - break; - case SEEK_CUR: - if (offset == 0) { - mutex_unlock(&inode->i_mutex); - return file->f_pos; - } - offset += file->f_pos; - break; - case SEEK_DATA: - /* - * In the generic case the entire file is data, so as long as - * offset isn't at the end of the file then the offset is data. - */ - if (offset >= inode->i_size) { - mutex_unlock(&inode->i_mutex); - return -ENXIO; - } - break; - case SEEK_HOLE: - /* - * There is a virtual hole at the end of the file, so as long as - * offset isn't i_size or larger, return i_size. - */ - if (offset >= inode->i_size) { - mutex_unlock(&inode->i_mutex); - return -ENXIO; - } - offset = inode->i_size; - break; - } - - if (offset < 0 || offset > maxbytes) { - mutex_unlock(&inode->i_mutex); - return -EINVAL; - } - - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - mutex_unlock(&inode->i_mutex); - return offset; + return generic_file_llseek_size(file, offset, origin, maxbytes); } const struct file_operations ext4_file_operations = { @@ -301,7 +256,7 @@ const struct inode_operations ext4_file_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext4_check_acl, + .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index da3bed3e0c29..00a2cb753efd 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode) * to written. * The function return the number of pending IOs on success. */ -extern int ext4_flush_completed_IO(struct inode *inode) +int ext4_flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; struct ext4_inode_info *ei = EXT4_I(inode); @@ -83,14 +83,12 @@ extern int ext4_flush_completed_IO(struct inode *inode) int ret = 0; int ret2 = 0; - if (list_empty(&ei->i_completed_io_list)) - return ret; - dump_completed_IO(inode); spin_lock_irqsave(&ei->i_completed_io_lock, flags); while (!list_empty(&ei->i_completed_io_list)){ io = list_entry(ei->i_completed_io_list.next, ext4_io_end_t, list); + list_del_init(&io->list); /* * Calling ext4_end_io_nolock() to convert completed * IO to written. @@ -107,11 +105,9 @@ extern int ext4_flush_completed_IO(struct inode *inode) */ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); ret = ext4_end_io_nolock(io); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); if (ret < 0) ret2 = ret; - else - list_del_init(&io->list); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); } spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); return (ret2 < 0) ? ret2 : 0; @@ -129,15 +125,30 @@ static int ext4_sync_parent(struct inode *inode) { struct writeback_control wbc; struct dentry *dentry = NULL; + struct inode *next; int ret = 0; - while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { + if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + return 0; + inode = igrab(inode); + while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); - dentry = list_entry(inode->i_dentry.next, - struct dentry, d_alias); - if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) + dentry = NULL; + spin_lock(&inode->i_lock); + if (!list_empty(&inode->i_dentry)) { + dentry = list_first_entry(&inode->i_dentry, + struct dentry, d_alias); + dget(dentry); + } + spin_unlock(&inode->i_lock); + if (!dentry) + break; + next = igrab(dentry->d_parent->d_inode); + dput(dentry); + if (!next) break; - inode = dentry->d_parent->d_inode; + iput(inode); + inode = next; ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; @@ -148,6 +159,7 @@ static int ext4_sync_parent(struct inode *inode) if (ret) break; } + iput(inode); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 21bb2f61e502..00beb4f9cc4f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -78,7 +78,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_blks_set(sb, gdp, 0); + ext4_free_group_clusters_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); @@ -293,121 +293,9 @@ error_return: ext4_std_error(sb, fatal); } -/* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block - * group to find a free inode. - */ -static int find_group_dir(struct super_block *sb, struct inode *parent, - ext4_group_t *best_group) -{ - ext4_group_t ngroups = ext4_get_groups_count(sb); - unsigned int freei, avefreei; - struct ext4_group_desc *desc, *best_desc = NULL; - ext4_group_t group; - int ret = -1; - - freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; - - for (group = 0; group < ngroups; group++) { - desc = ext4_get_group_desc(sb, group, NULL); - if (!desc || !ext4_free_inodes_count(sb, desc)) - continue; - if (ext4_free_inodes_count(sb, desc) < avefreei) - continue; - if (!best_desc || - (ext4_free_blks_count(sb, desc) > - ext4_free_blks_count(sb, best_desc))) { - *best_group = group; - best_desc = desc; - ret = 0; - } - } - return ret; -} - -#define free_block_ratio 10 - -static int find_group_flex(struct super_block *sb, struct inode *parent, - ext4_group_t *best_group) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_desc *desc; - struct flex_groups *flex_group = sbi->s_flex_groups; - ext4_group_t parent_group = EXT4_I(parent)->i_block_group; - ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); - ext4_group_t ngroups = ext4_get_groups_count(sb); - int flex_size = ext4_flex_bg_size(sbi); - ext4_group_t best_flex = parent_fbg_group; - int blocks_per_flex = sbi->s_blocks_per_group * flex_size; - int flexbg_free_blocks; - int flex_freeb_ratio; - ext4_group_t n_fbg_groups; - ext4_group_t i; - - n_fbg_groups = (ngroups + flex_size - 1) >> - sbi->s_log_groups_per_flex; - -find_close_to_parent: - flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks); - flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; - if (atomic_read(&flex_group[best_flex].free_inodes) && - flex_freeb_ratio > free_block_ratio) - goto found_flexbg; - - if (best_flex && best_flex == parent_fbg_group) { - best_flex--; - goto find_close_to_parent; - } - - for (i = 0; i < n_fbg_groups; i++) { - if (i == parent_fbg_group || i == parent_fbg_group - 1) - continue; - - flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks); - flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; - - if (flex_freeb_ratio > free_block_ratio && - (atomic_read(&flex_group[i].free_inodes))) { - best_flex = i; - goto found_flexbg; - } - - if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) || - ((atomic_read(&flex_group[i].free_blocks) > - atomic_read(&flex_group[best_flex].free_blocks)) && - atomic_read(&flex_group[i].free_inodes))) - best_flex = i; - } - - if (!atomic_read(&flex_group[best_flex].free_inodes) || - !atomic_read(&flex_group[best_flex].free_blocks)) - return -1; - -found_flexbg: - for (i = best_flex * flex_size; i < ngroups && - i < (best_flex + 1) * flex_size; i++) { - desc = ext4_get_group_desc(sb, i, NULL); - if (ext4_free_inodes_count(sb, desc)) { - *best_group = i; - goto out; - } - } - - return -1; -out: - return 0; -} - struct orlov_stats { __u32 free_inodes; - __u32 free_blocks; + __u32 free_clusters; __u32 used_dirs; }; @@ -424,7 +312,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_blocks = atomic_read(&flex_group[g].free_blocks); + stats->free_clusters = atomic_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } @@ -432,11 +320,11 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, desc = ext4_get_group_desc(sb, g, NULL); if (desc) { stats->free_inodes = ext4_free_inodes_count(sb, desc); - stats->free_blocks = ext4_free_blks_count(sb, desc); + stats->free_clusters = ext4_free_group_clusters(sb, desc); stats->used_dirs = ext4_used_dirs_count(sb, desc); } else { stats->free_inodes = 0; - stats->free_blocks = 0; + stats->free_clusters = 0; stats->used_dirs = 0; } } @@ -471,10 +359,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; - ext4_fsblk_t freeb, avefreeb; + ext4_fsblk_t freeb, avefreec; unsigned int ndirs; int max_dirs, min_inodes; - ext4_grpblk_t min_blocks; + ext4_grpblk_t min_clusters; ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; @@ -490,9 +378,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; - freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - avefreeb = freeb; - do_div(avefreeb, ngroups); + freeb = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); + avefreec = freeb; + do_div(avefreec, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); if (S_ISDIR(mode) && @@ -518,7 +407,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, continue; if (stats.free_inodes < avefreei) continue; - if (stats.free_blocks < avefreeb) + if (stats.free_clusters < avefreec) continue; grp = g; ret = 0; @@ -556,7 +445,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, min_inodes = avefreei - inodes_per_group*flex_size / 4; if (min_inodes < 1) min_inodes = 1; - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; + min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; /* * Start looking in the flex group where we last allocated an @@ -575,7 +464,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, continue; if (stats.free_inodes < min_inodes) continue; - if (stats.free_blocks < min_blocks) + if (stats.free_clusters < min_clusters) continue; goto found_flex_bg; } @@ -659,7 +548,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_blks_count(sb, desc)) + ext4_free_group_clusters(sb, desc)) return 0; /* @@ -683,7 +572,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_blks_count(sb, desc)) + ext4_free_group_clusters(sb, desc)) return 0; } @@ -802,7 +691,7 @@ err_ret: * group to find a free inode. */ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, - const struct qstr *qstr, __u32 goal) + const struct qstr *qstr, __u32 goal, uid_t *owner) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -816,8 +705,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, int ret2, err = 0; struct inode *ret; ext4_group_t i; - int free = 0; - static int once = 1; ext4_group_t flex_group; /* Cannot create files in a deleted directory */ @@ -843,26 +730,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, goto got_group; } - if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { - ret2 = find_group_flex(sb, dir, &group); - if (ret2 == -1) { - ret2 = find_group_other(sb, dir, &group, mode); - if (ret2 == 0 && once) { - once = 0; - printk(KERN_NOTICE "ext4: find_group_flex " - "failed, fallback succeeded dir %lu\n", - dir->i_ino); - } - } - goto got_group; - } - - if (S_ISDIR(mode)) { - if (test_opt(sb, OLDALLOC)) - ret2 = find_group_dir(sb, dir, &group); - else - ret2 = find_group_orlov(sb, dir, &group, mode, qstr); - } else + if (S_ISDIR(mode)) + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); + else ret2 = find_group_other(sb, dir, &group, mode); got_group: @@ -950,26 +820,21 @@ got: goto fail; } - free = 0; - ext4_lock_group(sb, group); + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); + brelse(block_bitmap_bh); + /* recheck and clear flag under lock if we still need to */ + ext4_lock_group(sb, group); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - free = ext4_free_blocks_after_init(sb, group, gdp); gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_blks_set(sb, gdp, free); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } ext4_unlock_group(sb, group); - /* Don't need to dirty bitmap block if we didn't change it */ - if (free) { - BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); - err = ext4_handle_dirty_metadata(handle, - NULL, block_bitmap_bh); - } - - brelse(block_bitmap_bh); if (err) goto fail; } @@ -987,8 +852,11 @@ got: flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - - if (test_opt(sb, GRPID)) { + if (owner) { + inode->i_mode = mode; + inode->i_uid = owner[0]; + inode->i_gid = owner[1]; + } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; @@ -1005,11 +873,7 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - /* - * Don't inherit extent flag from directory, amongst others. We set - * extent flag on newly created directory and file only if -o extent - * mount option is specified - */ + /* Don't inherit extent flag from directory, amongst others. */ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); ei->i_file_acl = 0; @@ -1084,7 +948,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); brelse(inode_bitmap_bh); @@ -1235,7 +1099,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) * inode allocation from the current group, so we take alloc_sem lock, to * block ext4_claim_inode until we are finished. */ -extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, +int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); @@ -1287,7 +1151,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; - goto out; + goto err_out; } blk = ext4_inode_table(sb, gdp) + used_blks; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c new file mode 100644 index 000000000000..3cfc73fbca8e --- /dev/null +++ b/fs/ext4/indirect.c @@ -0,0 +1,1503 @@ +/* + * linux/fs/ext4/indirect.c + * + * from + * + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + */ + +#include <linux/module.h> +#include "ext4_jbd2.h" +#include "truncate.h" + +#include <trace/events/ext4.h> + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +/** + * ext4_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext4 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext4_block_to_path(struct inode *inode, + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) +{ + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT4_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT4_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT4_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT4_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext4_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples <key, p, bh> and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) + */ +static Indirect *ext4_get_branch(struct inode *inode, int depth, + ext4_lblk_t *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_getblk(sb, le32_to_cpu(p->key)); + if (unlikely(!bh)) + goto failure; + + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto failure; + } + /* validate block references */ + if (ext4_check_indirect_blockref(inode, bh)) { + put_bh(bh); + goto failure; + } + } + + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext4_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + return ext4_inode_to_goal_block(inode); +} + +/** + * ext4_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. + * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. + */ +static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, + Indirect *partial) +{ + ext4_fsblk_t goal; + + /* + * XXX need to get goal block from mballoc's data structures + */ + + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; +} + +/** + * ext4_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, + int blocks_to_boundary) +{ + unsigned int count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext4_alloc_blocks: multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: inode which needs allocated blocks + * @iblock: the logical block to start allocated at + * @goal: preferred physical block of allocation + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * @blks: number of desired blocks + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @err: on return it will store the error code + * + * This function will return the number of blocks allocated as + * requested by the passed-in parameters. + */ +static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) +{ + struct ext4_allocation_request ar; + int target, i; + unsigned long count = 0, blk_allocated = 0; + int index = 0; + ext4_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext4_new_meta_blocks(handle, inode, goal, + 0, &count, err); + if (*err) + goto failed_out; + + if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + count %lu > %d!", + current_block, count, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); + break; + } + } + + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = target; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + /* enable in-core preallocation only for regular files */ + ar.flags = EXT4_MB_HINT_DATA; + + current_block = ext4_mb_new_blocks(handle, &ar, err); + if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + ar.len %d > %d!", + current_block, ar.len, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += ar.len; + } +allocated: + /* total number of blocks allocated for direct blocks */ + ret = blk_allocated; + *err = 0; + return ret; +failed_out: + for (i = 0; i < index; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + return ret; +} + +/** + * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction + * @inode: owner + * @indirect_blks: number of allocated indirect blocks + * @blks: number of allocated direct blocks + * @goal: preferred place for allocation + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext4_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext4_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ +static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext4_fsblk_t new_blocks[4]; + ext4_fsblk_t current_block; + + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -EIO; + goto failed; + } + + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, bh); + if (err) { + /* Don't brelse(bh) here; it's done in + * ext4_journal_forget() below */ + unlock_buffer(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if (n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i = 1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); + for (i = 1; i <= n ; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, + EXT4_FREE_BLOCKS_FORGET); + } + for (i = n+1; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + + ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); + + return err; +} + +/** + * ext4_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext4_alloc_branch) + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext4_splice_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t block, Indirect *where, int num, + int blks) +{ + int i; + int err = 0; + ext4_fsblk_t current_block; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i) = cpu_to_le32(current_block++); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + */ + ext4_mark_inode_dirty(handle, inode); + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); + } + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), + blks, 0); + + return err; +} + +/* + * The ext4_ind_map_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_map_blocks(). + * + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + * + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. + */ +int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + int flags) +{ + int err = -EIO; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext4_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + int count = 0; + ext4_fsblk_t first_block = 0; + + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); + depth = ext4_block_to_path(inode, map->m_lblk, offsets, + &blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + count++; + /*map more blocks*/ + while (count < map->m_len && count <= blocks_to_boundary) { + ext4_fsblk_t blk; + + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + goto cleanup; + + /* + * Okay, we need to do block allocation. + */ + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + EXT4_ERROR_INODE(inode, "Can't allocate blocks for " + "non-extent mapped inodes with bigalloc"); + return -ENOSPC; + } + + goal = ext4_find_goal(inode, map->m_lblk, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + /* + * Block out ext4_truncate while we alter the tree + */ + err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext4_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct + */ + if (!err) + err = ext4_splice_branch(handle, inode, map->m_lblk, + partial, indirect_blks, count); + if (err) + goto cleanup; + + map->m_flags |= EXT4_MAP_NEW; + + ext4_update_inode_fsync_trans(handle, inode, 1); +got_it: + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); + map->m_len = count; + if (count > blocks_to_boundary) + map->m_flags |= EXT4_MAP_BOUNDARY; + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +out: + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, + map->m_pblk, map->m_len, err); + return err; +} + +/* + * O_DIRECT for ext3 (or indirect map) based files + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. + */ +ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + int retries = 0; + + if (rw == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } + } + +retry: + if (rw == READ && ext4_should_dioread_nolock(inode)) { + if (unlikely(!list_empty(&ei->i_completed_io_list))) { + mutex_lock(&inode->i_mutex); + ext4_flush_completed_IO(inode); + mutex_unlock(&inode->i_mutex); + } + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL, NULL, 0); + } else { + ret = blockdev_direct_IO(rw, iocb, inode, iov, + offset, nr_segs, ext4_get_block); + + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + ext4_truncate_failed_write(inode); + } + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate a new block at @lblocks for non extent file based file + */ +int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); + int blk_bits; + + if (lblock < EXT4_NDIR_BLOCKS) + return 0; + + lblock -= EXT4_NDIR_BLOCKS; + + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = order_base_2(lblock); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; +} + +int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block + */ + return DIV_ROUND_UP(nrblocks, + EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext4_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext4_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext4_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext4_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext4_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext4_find_shared(struct inode *inode, int depth, + ext4_lblk_t offsets[4], Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext4_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext4. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + * + * Return 0 on success, 1 on invalid block range + * and < 0 on fatal error. + */ +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) +{ + __le32 *p; + int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int err; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + EXT4_ERROR_INODE(inode, "attempt to clear invalid " + "blocks %llu len %lu", + (unsigned long long) block_to_free, count); + return 1; + } + + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + goto out_err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + goto out_err; + err = ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + if (unlikely(err)) + goto out_err; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out_err; + } + } + + for (p = first; p < last; p++) + *p = 0; + + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); + return 0; +out_err: + ext4_std_error(inode->i_sb, err); + return err; +} + +/** + * ext4_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext4_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext4_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err = 0; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + err = ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p); + if (err) + break; + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (!err && count > 0) + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + if (err < 0) + /* fatal error */ + return; + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) + ext4_handle_dirty_metadata(handle, inode, this_bh); + else + EXT4_ERROR_INODE(inode, + "circular indirect block detected at " + "block %llu", + (unsigned long long) this_bh->b_blocknr); + } +} + +/** + * ext4_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext4_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext4_fsblk_t nr; + __le32 *p; + + if (ext4_handle_is_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + EXT4_ERROR_INODE(inode, + "invalid indirect mapped " + "block %lu (level %d)", + (unsigned long) nr, depth); + break; + } + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, nr, + "Read failure"); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext4_free_branches(handle, inode, bh, + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); + brelse(bh); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (ext4_handle_is_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext4_mark_inode_dirty(handle, inode); + ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + } + + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. + */ + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext4_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext4_free_data(handle, inode, parent_bh, first, last); + } +} + +void ext4_ind_truncate(struct inode *inode) +{ + handle_t *handle; + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n = 0; + ext4_lblk_t last_block, max_block; + loff_t page_len; + unsigned blocksize = inode->i_sb->s_blocksize; + int err; + + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ + + last_block = (inode->i_size + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + + if (inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) + goto out_stop; + } + + if (last_block != max_block) { + n = ext4_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + } + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); + + ext4_discard_preallocations(inode); + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext4 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + if (last_block == max_block) { + /* + * It is unnecessary to free any data blocks if last_block is + * equal to the indirect block limit. + */ + goto out_unlock; + } else if (n == 1) { /* direct blocks */ + ext4_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } + +out_unlock: + up_write(&ei->i_data_sem); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ + if (IS_SYNC(inode)) + ext4_handle_sync(handle); +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + ext4_journal_stop(handle); + trace_ext4_truncate_exit(inode); +} + diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 678cde834f19..848f436df29f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -12,10 +12,6 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * - * Goal-directed block allocation by Stephen Tweedie - * (sct@redhat.com), 1993, 1998 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) * @@ -46,7 +42,7 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "ext4_extents.h" +#include "truncate.h" #include <trace/events/ext4.h> @@ -89,72 +85,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) } /* - * Work out how many blocks we need to proceed with the next chunk of a - * truncate transaction. - */ -static unsigned long blocks_for_truncate(struct inode *inode) -{ - ext4_lblk_t needed; - - needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); - - /* Give ourselves just enough room to cope with inodes in which - * i_blocks is corrupt: we've seen disk corruptions in the past - * which resulted in random data in an inode which looked enough - * like a regular file for ext4 to try to delete it. Things - * will go a bit crazy if that happens, but at least we should - * try not to panic the whole kernel. */ - if (needed < 2) - needed = 2; - - /* But we need to bound the transaction so we don't overflow the - * journal. */ - if (needed > EXT4_MAX_TRANS_DATA) - needed = EXT4_MAX_TRANS_DATA; - - return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; -} - -/* - * Truncate transactions can be complex and absolutely huge. So we need to - * be able to restart the transaction at a conventient checkpoint to make - * sure we don't overflow the journal. - * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If - * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct - */ -static handle_t *start_transaction(struct inode *inode) -{ - handle_t *result; - - result = ext4_journal_start(inode, blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext4_std_error(inode->i_sb, PTR_ERR(result)); - return result; -} - -/* - * Try to extend this transaction for the purposes of truncation. - * - * Returns 0 if we managed to create more room. If we can't create more - * room, and the transaction must be restarted we return 1. - */ -static int try_to_extend_transaction(handle_t *handle, struct inode *inode) -{ - if (!ext4_handle_valid(handle)) - return 0; - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) - return 0; - return 1; -} - -/* * Restart the transaction associated with *handle. This does a commit, * so before we call here everything must be consistently dirtied against * this transaction. @@ -189,7 +119,37 @@ void ext4_evict_inode(struct inode *inode) int err; trace_ext4_evict_inode(inode); + + ext4_ioend_wait(inode); + if (inode->i_nlink) { + /* + * When journalling data dirty buffers are tracked only in the + * journal. So although mm thinks everything is clean and + * ready for reaping the inode might still have some pages to + * write in the running transaction or waiting to be + * checkpointed. Thus calling jbd2_journal_invalidatepage() + * (via truncate_inode_pages()) to discard these buffers can + * cause data loss. Also even if we did not discard these + * buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to + * read them before the transaction is checkpointed. So be + * careful and force everything to disk here... We use + * ei->i_datasync_tid to store the newest transaction + * containing inode's data. + * + * Note that directories do not have this problem because they + * don't use page cache. + */ + if (ext4_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; + + jbd2_log_start_commit(journal, commit_tid); + jbd2_log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } truncate_inode_pages(&inode->i_data, 0); goto no_delete; } @@ -204,7 +164,7 @@ void ext4_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; - handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); + handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -277,793 +237,6 @@ no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } -typedef struct { - __le32 *p; - __le32 key; - struct buffer_head *bh; -} Indirect; - -static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) -{ - p->key = *(p->p = v); - p->bh = bh; -} - -/** - * ext4_block_to_path - parse the block number into array of offsets - * @inode: inode in question (we are only interested in its superblock) - * @i_block: block number to be parsed - * @offsets: array to store the offsets in - * @boundary: set this non-zero if the referred-to block is likely to be - * followed (on disk) by an indirect block. - * - * To store the locations of file's data ext4 uses a data structure common - * for UNIX filesystems - tree of pointers anchored in the inode, with - * data blocks at leaves and indirect blocks in intermediate nodes. - * This function translates the block number into path in that tree - - * return value is the path length and @offsets[n] is the offset of - * pointer to (n+1)th node in the nth one. If @block is out of range - * (negative or too large) warning is printed and zero returned. - * - * Note: function doesn't find node addresses, so no IO is needed. All - * we need to know is the capacity of indirect blocks (taken from the - * inode->i_sb). - */ - -/* - * Portability note: the last comparison (check that we fit into triple - * indirect block) is spelled differently, because otherwise on an - * architecture with 32-bit longs and 8Kb pages we might get into trouble - * if our filesystem had 8Kb blocks. We might use long long, but that would - * kill us on x86. Oh, well, at least the sign propagation does not matter - - * i_block would have to be negative in the very beginning, so we would not - * get there at all. - */ - -static int ext4_block_to_path(struct inode *inode, - ext4_lblk_t i_block, - ext4_lblk_t offsets[4], int *boundary) -{ - int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); - const long direct_blocks = EXT4_NDIR_BLOCKS, - indirect_blocks = ptrs, - double_blocks = (1 << (ptrs_bits * 2)); - int n = 0; - int final = 0; - - if (i_block < direct_blocks) { - offsets[n++] = i_block; - final = direct_blocks; - } else if ((i_block -= direct_blocks) < indirect_blocks) { - offsets[n++] = EXT4_IND_BLOCK; - offsets[n++] = i_block; - final = ptrs; - } else if ((i_block -= indirect_blocks) < double_blocks) { - offsets[n++] = EXT4_DIND_BLOCK; - offsets[n++] = i_block >> ptrs_bits; - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { - offsets[n++] = EXT4_TIND_BLOCK; - offsets[n++] = i_block >> (ptrs_bits * 2); - offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else { - ext4_warning(inode->i_sb, "block %lu > max in inode %lu", - i_block + direct_blocks + - indirect_blocks + double_blocks, inode->i_ino); - } - if (boundary) - *boundary = final - 1 - (i_block & (ptrs - 1)); - return n; -} - -static int __ext4_check_blockref(const char *function, unsigned int line, - struct inode *inode, - __le32 *p, unsigned int max) -{ - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - __le32 *bref = p; - unsigned int blk; - - while (bref < p+max) { - blk = le32_to_cpu(*bref++); - if (blk && - unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), - blk, 1))) { - es->s_last_error_block = cpu_to_le64(blk); - ext4_error_inode(inode, function, line, blk, - "invalid block"); - return -EIO; - } - } - return 0; -} - - -#define ext4_check_indirect_blockref(inode, bh) \ - __ext4_check_blockref(__func__, __LINE__, inode, \ - (__le32 *)(bh)->b_data, \ - EXT4_ADDR_PER_BLOCK((inode)->i_sb)) - -#define ext4_check_inode_blockref(inode) \ - __ext4_check_blockref(__func__, __LINE__, inode, \ - EXT4_I(inode)->i_data, \ - EXT4_NDIR_BLOCKS) - -/** - * ext4_get_branch - read the chain of indirect blocks leading to data - * @inode: inode in question - * @depth: depth of the chain (1 - direct pointer, etc.) - * @offsets: offsets of pointers in inode/indirect blocks - * @chain: place to store the result - * @err: here we store the error value - * - * Function fills the array of triples <key, p, bh> and returns %NULL - * if everything went OK or the pointer to the last filled triple - * (incomplete one) otherwise. Upon the return chain[i].key contains - * the number of (i+1)-th block in the chain (as it is stored in memory, - * i.e. little-endian 32-bit), chain[i].p contains the address of that - * number (it points into struct inode for i==0 and into the bh->b_data - * for i>0) and chain[i].bh points to the buffer_head of i-th indirect - * block for i>0 and NULL for i==0. In other words, it holds the block - * numbers of the chain, addresses they were taken from (and where we can - * verify that chain did not change) and buffer_heads hosting these - * numbers. - * - * Function stops when it stumbles upon zero pointer (absent block) - * (pointer to last triple returned, *@err == 0) - * or when it gets an IO error reading an indirect block - * (ditto, *@err == -EIO) - * or when it reads all @depth-1 indirect blocks successfully and finds - * the whole chain, all way to the data (returns %NULL, *err == 0). - * - * Need to be called with - * down_read(&EXT4_I(inode)->i_data_sem) - */ -static Indirect *ext4_get_branch(struct inode *inode, int depth, - ext4_lblk_t *offsets, - Indirect chain[4], int *err) -{ - struct super_block *sb = inode->i_sb; - Indirect *p = chain; - struct buffer_head *bh; - - *err = 0; - /* i_data is not going away, no lock needed */ - add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); - if (!p->key) - goto no_block; - while (--depth) { - bh = sb_getblk(sb, le32_to_cpu(p->key)); - if (unlikely(!bh)) - goto failure; - - if (!bh_uptodate_or_lock(bh)) { - if (bh_submit_read(bh) < 0) { - put_bh(bh); - goto failure; - } - /* validate block references */ - if (ext4_check_indirect_blockref(inode, bh)) { - put_bh(bh); - goto failure; - } - } - - add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); - /* Reader: end */ - if (!p->key) - goto no_block; - } - return NULL; - -failure: - *err = -EIO; -no_block: - return p; -} - -/** - * ext4_find_near - find a place for allocation with sufficient locality - * @inode: owner - * @ind: descriptor of indirect block. - * - * This function returns the preferred place for block allocation. - * It is used when heuristic for sequential allocation fails. - * Rules are: - * + if there is a block to the left of our position - allocate near it. - * + if pointer will live in indirect block - allocate near that block. - * + if pointer will live in inode - allocate in the same - * cylinder group. - * - * In the latter case we colour the starting block by the callers PID to - * prevent it from clashing with concurrent allocations for a different inode - * in the same block group. The PID is used here so that functionally related - * files will be close-by on-disk. - * - * Caller must make sure that @ind is valid and will stay that way. - */ -static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; - __le32 *p; - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); - - /* Try to find previous block */ - for (p = ind->p - 1; p >= start; p--) { - if (*p) - return le32_to_cpu(*p); - } - - /* No such thing, so let's try location of indirect block */ - if (ind->bh) - return ind->bh->b_blocknr; - - /* - * It is going to be referred to from the inode itself? OK, just put it - * into the same cylinder group then. - */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - /* - * If we are doing delayed allocation, we don't need take - * colour into account. - */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour; -} - -/** - * ext4_find_goal - find a preferred place for allocation. - * @inode: owner - * @block: block we want - * @partial: pointer to the last triple within a chain - * - * Normally this function find the preferred place for block allocation, - * returns it. - * Because this is only used for non-extent files, we limit the block nr - * to 32 bits. - */ -static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, - Indirect *partial) -{ - ext4_fsblk_t goal; - - /* - * XXX need to get goal block from mballoc's data structures - */ - - goal = ext4_find_near(inode, partial); - goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - return goal; -} - -/** - * ext4_blks_to_allocate - Look up the block map and count the number - * of direct blocks need to be allocated for the given branch. - * - * @branch: chain of indirect blocks - * @k: number of blocks need for indirect blocks - * @blks: number of data blocks to be mapped. - * @blocks_to_boundary: the offset in the indirect block - * - * return the total number of blocks to be allocate, including the - * direct and indirect blocks. - */ -static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, - int blocks_to_boundary) -{ - unsigned int count = 0; - - /* - * Simple case, [t,d]Indirect block(s) has not allocated yet - * then it's clear blocks on that path have not allocated - */ - if (k > 0) { - /* right now we don't handle cross boundary allocation */ - if (blks < blocks_to_boundary + 1) - count += blks; - else - count += blocks_to_boundary + 1; - return count; - } - - count++; - while (count < blks && count <= blocks_to_boundary && - le32_to_cpu(*(branch[0].p + count)) == 0) { - count++; - } - return count; -} - -/** - * ext4_alloc_blocks: multiple allocate blocks needed for a branch - * @handle: handle for this transaction - * @inode: inode which needs allocated blocks - * @iblock: the logical block to start allocated at - * @goal: preferred physical block of allocation - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: number of desired blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @err: on return it will store the error code - * - * This function will return the number of blocks allocated as - * requested by the passed-in parameters. - */ -static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) -{ - struct ext4_allocation_request ar; - int target, i; - unsigned long count = 0, blk_allocated = 0; - int index = 0; - ext4_fsblk_t current_block = 0; - int ret = 0; - - /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. That's the - * minimum number of blocks need to allocate(required) - */ - /* first we try to allocate the indirect blocks */ - target = indirect_blks; - while (target > 0) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, goal, - 0, &count, err); - if (*err) - goto failed_out; - - if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + count %lu > %d!", - current_block, count, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - if (count > 0) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - printk(KERN_INFO "%s returned more blocks than " - "requested\n", __func__); - WARN_ON(1); - break; - } - } - - target = blks - count ; - blk_allocated = count; - if (!target) - goto allocated; - /* Now allocate data blocks */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = target; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - /* enable in-core preallocation only for regular files */ - ar.flags = EXT4_MB_HINT_DATA; - - current_block = ext4_mb_new_blocks(handle, &ar, err); - if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + ar.len %d > %d!", - current_block, ar.len, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - if (*err && (target == blks)) { - /* - * if the allocation failed and we didn't allocate - * any blocks before - */ - goto failed_out; - } - if (!*err) { - if (target == blks) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - } - blk_allocated += ar.len; - } -allocated: - /* total number of blocks allocated for direct blocks */ - ret = blk_allocated; - *err = 0; - return ret; -failed_out: - for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - return ret; -} - -/** - * ext4_alloc_branch - allocate and set up a chain of blocks. - * @handle: handle for this transaction - * @inode: owner - * @indirect_blks: number of allocated indirect blocks - * @blks: number of allocated direct blocks - * @goal: preferred place for allocation - * @offsets: offsets (in the blocks) to store the pointers to next. - * @branch: place to store the chain in. - * - * This function allocates blocks, zeroes out all but the last one, - * links them into chain and (if we are synchronous) writes them to disk. - * In other words, it prepares a branch that can be spliced onto the - * inode. It stores the information about that chain in the branch[], in - * the same format as ext4_get_branch() would do. We are calling it after - * we had read the existing part of chain and partial points to the last - * triple of that (one with zero ->key). Upon the exit we have the same - * picture as after the successful ext4_get_block(), except that in one - * place chain is disconnected - *branch->p is still zero (we did not - * set the last link), but branch->key contains the number that should - * be placed into *branch->p to fill that gap. - * - * If allocation fails we free all blocks we've allocated (and forget - * their buffer_heads) and return the error value the from failed - * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain - * as described above and return 0. - */ -static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, int indirect_blks, - int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) -{ - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext4_fsblk_t new_blocks[4]; - ext4_fsblk_t current_block; - - num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; - - branch[0].key = cpu_to_le32(new_blocks[0]); - /* - * metadata blocks and data blocks are allocated. - */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); - if (unlikely(!bh)) { - err = -EIO; - goto failed; - } - - branch[n].bh = bh; - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - err = ext4_journal_get_create_access(handle, bh); - if (err) { - /* Don't brelse(bh) here; it's done in - * ext4_journal_forget() below */ - unlock_buffer(bh); - goto failed; - } - - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if (n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i = 1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (err) - goto failed; - } - *blks = num; - return err; -failed: - /* Allocation failed, free what we already allocated */ - ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); - for (i = 1; i <= n ; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, - EXT4_FREE_BLOCKS_FORGET); - } - for (i = n+1; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - - ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); - - return err; -} - -/** - * ext4_splice_branch - splice the allocated branch onto inode. - * @handle: handle for this transaction - * @inode: owner - * @block: (logical) number of block we are adding - * @chain: chain of indirect blocks (with a missing link - see - * ext4_alloc_branch) - * @where: location of missing link - * @num: number of indirect blocks we are adding - * @blks: number of direct blocks we are adding - * - * This function fills the missing link and does all housekeeping needed in - * inode (->i_blocks, etc.). In case of success we end up with the full - * chain to new block and return 0. - */ -static int ext4_splice_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t block, Indirect *where, int num, - int blks) -{ - int i; - int err = 0; - ext4_fsblk_t current_block; - - /* - * If we're splicing into a [td]indirect block (as opposed to the - * inode) then we need to get write access to the [td]indirect block - * before the splice. - */ - if (where->bh) { - BUFFER_TRACE(where->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, where->bh); - if (err) - goto err_out; - } - /* That's it */ - - *where->p = where->key; - - /* - * Update the host buffer_head or inode to point to more just allocated - * direct blocks blocks - */ - if (num == 0 && blks > 1) { - current_block = le32_to_cpu(where->key) + 1; - for (i = 1; i < blks; i++) - *(where->p + i) = cpu_to_le32(current_block++); - } - - /* We are done with atomic stuff, now do the rest of housekeeping */ - /* had we spliced it onto indirect block? */ - if (where->bh) { - /* - * If we spliced it onto an indirect block, we haven't - * altered the inode. Note however that if it is being spliced - * onto an indirect block at the very end of the file (the - * file is growing) then we *will* alter the inode to reflect - * the new i_size. But that is not done here - it is done in - * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. - */ - jbd_debug(5, "splicing indirect only\n"); - BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, where->bh); - if (err) - goto err_out; - } else { - /* - * OK, we spliced it into the inode itself on a direct block. - */ - ext4_mark_inode_dirty(handle, inode); - jbd_debug(5, "splicing direct\n"); - } - return err; - -err_out: - for (i = 1; i <= num; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, where[i].bh, 0, 1, - EXT4_FREE_BLOCKS_FORGET); - } - ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), - blks, 0); - - return err; -} - -/* - * The ext4_ind_map_blocks() function handles non-extents inodes - * (i.e., using the traditional indirect/double-indirect i_blocks - * scheme) for ext4_map_blocks(). - * - * Allocation strategy is simple: if we have to allocate something, we will - * have to go the whole way to leaf. So let's do it before attaching anything - * to tree, set linkage between the newborn blocks, write them if sync is - * required, recheck the path, free and repeat if check fails, otherwise - * set the last missing link (that will protect us from any truncate-generated - * removals - all blocks on the path are immune now) and possibly force the - * write on the parent block. - * That has a nice additional property: no special recovery from the failed - * allocations is needed - we simply release blocks and do not touch anything - * reachable from inode. - * - * `handle' can be NULL if create == 0. - * - * return > 0, # of blocks mapped or allocated. - * return = 0, if plain lookup failed. - * return < 0, error case. - * - * The ext4_ind_get_blocks() function should be called with - * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem - * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or - * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system - * blocks. - */ -static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, - int flags) -{ - int err = -EIO; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - ext4_fsblk_t goal; - int indirect_blks; - int blocks_to_boundary = 0; - int depth; - int count = 0; - ext4_fsblk_t first_block = 0; - - trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); - J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); - J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); - depth = ext4_block_to_path(inode, map->m_lblk, offsets, - &blocks_to_boundary); - - if (depth == 0) - goto out; - - partial = ext4_get_branch(inode, depth, offsets, chain, &err); - - /* Simplest case - block found, no allocation needed */ - if (!partial) { - first_block = le32_to_cpu(chain[depth - 1].key); - count++; - /*map more blocks*/ - while (count < map->m_len && count <= blocks_to_boundary) { - ext4_fsblk_t blk; - - blk = le32_to_cpu(*(chain[depth-1].p + count)); - - if (blk == first_block + count) - count++; - else - break; - } - goto got_it; - } - - /* Next simple case - plain lookup or failed read of indirect block */ - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) - goto cleanup; - - /* - * Okay, we need to do block allocation. - */ - goal = ext4_find_goal(inode, map->m_lblk, partial); - - /* the number of blocks need to allocate for [d,t]indirect blocks */ - indirect_blks = (chain + depth) - partial - 1; - - /* - * Next look up the indirect map to count the totoal number of - * direct blocks to allocate for this branch. - */ - count = ext4_blks_to_allocate(partial, indirect_blks, - map->m_len, blocks_to_boundary); - /* - * Block out ext4_truncate while we alter the tree - */ - err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, - &count, goal, - offsets + (partial - chain), partial); - - /* - * The ext4_splice_branch call will free and forget any buffers - * on the new chain if there is a failure, but that risks using - * up transaction credits, especially for bitmaps where the - * credits cannot be returned. Can we handle this somehow? We - * may need to return -EAGAIN upwards in the worst case. --sct - */ - if (!err) - err = ext4_splice_branch(handle, inode, map->m_lblk, - partial, indirect_blks, count); - if (err) - goto cleanup; - - map->m_flags |= EXT4_MAP_NEW; - - ext4_update_inode_fsync_trans(handle, inode, 1); -got_it: - map->m_flags |= EXT4_MAP_MAPPED; - map->m_pblk = le32_to_cpu(chain[depth-1].key); - map->m_len = count; - if (count > blocks_to_boundary) - map->m_flags |= EXT4_MAP_BOUNDARY; - err = count; - /* Clean up and exit */ - partial = chain + depth - 1; /* the whole chain */ -cleanup: - while (partial > chain) { - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } -out: - trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, - map->m_pblk, map->m_len, err); - return err; -} - #ifdef CONFIG_QUOTA qsize_t *ext4_get_reserved_space(struct inode *inode) { @@ -1073,33 +246,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) /* * Calculate the number of metadata blocks need to reserve - * to allocate a new block at @lblocks for non extent file based file - */ -static int ext4_indirect_calc_metadata_amount(struct inode *inode, - sector_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); - int blk_bits; - - if (lblock < EXT4_NDIR_BLOCKS) - return 0; - - lblock -= EXT4_NDIR_BLOCKS; - - if (ei->i_da_metadata_calc_len && - (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { - ei->i_da_metadata_calc_len++; - return 0; - } - ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; - ei->i_da_metadata_calc_len = 1; - blk_bits = order_base_2(lblock); - return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; -} - -/* - * Calculate the number of metadata blocks need to reserve * to allocate a block located at @lblock */ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) @@ -1107,7 +253,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return ext4_ext_calc_metadata_amount(inode, lblock); - return ext4_indirect_calc_metadata_amount(inode, lblock); + return ext4_ind_calc_metadata_amount(inode, lblock); } /* @@ -1121,7 +267,7 @@ void ext4_da_update_reserve_space(struct inode *inode, struct ext4_inode_info *ei = EXT4_I(inode); spin_lock(&ei->i_block_reservation_lock); - trace_ext4_da_update_reserve_space(inode, used); + trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " "with only %d reserved data blocks\n", @@ -1134,7 +280,7 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, used + ei->i_allocated_meta_blocks); ei->i_allocated_meta_blocks = 0; @@ -1144,7 +290,7 @@ void ext4_da_update_reserve_space(struct inode *inode, * only when we have written all of the delayed * allocation blocks. */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, ei->i_reserved_meta_blocks); ei->i_reserved_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; @@ -1153,14 +299,14 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update quota subsystem for data blocks */ if (quota_claim) - dquot_claim_block(inode, used); + dquot_claim_block(inode, EXT4_C2B(sbi, used)); else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should * not re-claim the quota for fallocated blocks. */ - dquot_release_reservation_block(inode, used); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); } /* @@ -1252,6 +398,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, } /* + * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. + */ +static void set_buffers_da_mapped(struct inode *inode, + struct ext4_map_blocks *map) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + int i, nr_pages; + pgoff_t index, end; + + index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (map->m_lblk + map->m_len - 1) >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, + min(end - index + 1, + (pgoff_t)PAGEVEC_SIZE)); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + if (unlikely(page->mapping != mapping) || + !PageDirty(page)) + break; + + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + set_buffer_da_mapped(bh); + bh = bh->b_this_page; + } while (bh != head); + } + index++; + } + pagevec_release(&pvec); + } +} + +/* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * @@ -1269,7 +458,7 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, * the buffer head is mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in - * that casem, buffer head is unmapped + * that case, buffer head is unmapped * * It returns the error in case of allocation failure. */ @@ -1288,9 +477,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, */ down_read((&EXT4_I(inode)->i_data_sem)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, 0); + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); } else { - retval = ext4_ind_map_blocks(handle, inode, map, 0); + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); } up_read((&EXT4_I(inode)->i_data_sem)); @@ -1308,7 +499,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * Returns if the blocks have already allocated * * Note that if blocks have been preallocated - * ext4_ext_get_block() returns th create = 0 + * ext4_ext_get_block() returns the create = 0 * with buffer head unmapped. */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) @@ -1370,9 +561,17 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) ext4_da_update_reserve_space(inode, retval, 1); } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); + /* If we have successfully mapped the delayed allocated blocks, + * set the BH_Da_Mapped bit on them. Its important to do this + * under the protection of i_data_sem. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + set_buffers_da_mapped(inode, map); + } + up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret = check_block_validity(inode, map); @@ -1500,7 +699,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return bh; if (buffer_uptodate(bh)) return bh; - ll_rw_block(READ_META, 1, &bh); + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1589,16 +788,6 @@ static int do_journal_get_write_access(handle_t *handle, return ret; } -/* - * Truncate blocks that were not used by write. We have to truncate the - * pagecache as well so that corresponding buffers get properly unmapped. - */ -static void ext4_truncate_failed_write(struct inode *inode) -{ - truncate_inode_pages(inode->i_mapping, inode->i_size); - ext4_truncate(inode); -} - static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, @@ -1772,7 +961,11 @@ static int ext4_ordered_write_end(struct file *file, ext4_orphan_add(handle, inode); if (ret2 < 0) ret = ret2; + } else { + unlock_page(page); + page_cache_release(page); } + ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -1849,6 +1042,8 @@ static int ext4_journalled_write_end(struct file *file, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; + BUG_ON(!ext4_handle_valid(handle)); + if (copied < len) { if (!PageUptodate(page)) copied = 0; @@ -1863,6 +1058,7 @@ static int ext4_journalled_write_end(struct file *file, if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; if (new_i_size > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, new_i_size); ret2 = ext4_mark_inode_dirty(handle, inode); @@ -1897,14 +1093,14 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve a single block located at lblock + * Reserve a single cluster located at lblock */ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long md_needed; + unsigned int md_needed; int ret; /* @@ -1914,7 +1110,8 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) */ repeat: spin_lock(&ei->i_block_reservation_lock); - md_needed = ext4_calc_metadata_amount(inode, lblock); + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); trace_ext4_da_reserve_space(inode, md_needed); spin_unlock(&ei->i_block_reservation_lock); @@ -1923,15 +1120,15 @@ repeat: * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ - ret = dquot_reserve_block(inode, 1); + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); if (ret) return ret; /* * We do still charge estimated metadata to the sb though; * we cannot afford to run out of free blocks. */ - if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { - dquot_release_reservation_block(inode, 1); + if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) { + dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; @@ -1978,19 +1175,21 @@ static void ext4_da_release_space(struct inode *inode, int to_free) * We can release all of the reserved metadata blocks * only when we have written all of the delayed * allocation blocks. + * Note that in case of bigalloc, i_reserved_meta_blocks, + * i_reserved_data_blocks, etc. refer to number of clusters. */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, ei->i_reserved_meta_blocks); ei->i_reserved_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; } /* update fs dirty data blocks counter */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); + percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - dquot_release_reservation_block(inode, to_free); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } static void ext4_da_page_release_reservation(struct page *page, @@ -1999,6 +1198,9 @@ static void ext4_da_page_release_reservation(struct page *page, int to_release = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; + struct inode *inode = page->mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int num_clusters; head = page_buffers(page); bh = head; @@ -2008,10 +1210,24 @@ static void ext4_da_page_release_reservation(struct page *page, if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; clear_buffer_delay(bh); + clear_buffer_da_mapped(bh); } curr_off = next_off; } while ((bh = bh->b_this_page) != head); - ext4_da_release_space(page->mapping->host, to_release); + + /* If we have released all the blocks belonging to a cluster, then we + * need to release the reserved space for that cluster. */ + num_clusters = EXT4_NUM_B2C(sbi, to_release); + while (num_clusters > 0) { + ext4_fsblk_t lblk; + lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + + ((num_clusters - 1) << sbi->s_cluster_bits); + if (sbi->s_cluster_ratio == 1 || + !ext4_find_delalloc_cluster(inode, lblk, 1)) + ext4_da_release_space(inode, 1); + + num_clusters--; + } } /* @@ -2113,6 +1329,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_delay(bh); bh->b_blocknr = pblock; } + if (buffer_da_mapped(bh)) + clear_buffer_da_mapped(bh); if (buffer_unwritten(bh) || buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -2148,7 +1366,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) err = ext4_bio_write_page(&io_submit, page, len, mpd->wbc); - else + else if (buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + err = block_write_full_page_endio(page, + noalloc_get_block_write, + mpd->wbc, ext4_end_io_buffer_write); + } else err = block_write_full_page(page, noalloc_get_block_write, mpd->wbc); @@ -2201,12 +1424,15 @@ static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); printk(KERN_CRIT "Total free blocks count %lld\n", - ext4_count_free_blocks(inode->i_sb)); + EXT4_C2B(EXT4_SB(inode->i_sb), + ext4_count_free_clusters(inode->i_sb))); printk(KERN_CRIT "Free/Dirty block details\n"); printk(KERN_CRIT "free_blocks=%lld\n", - (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); + (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + percpu_counter_sum(&sbi->s_freeclusters_counter))); printk(KERN_CRIT "dirty_blocks=%lld\n", - (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); printk(KERN_CRIT "Block reservation details\n"); printk(KERN_CRIT "i_reserved_data_blocks=%u\n", EXT4_I(inode)->i_reserved_data_blocks); @@ -2285,8 +1511,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) if (err == -EAGAIN) goto submit_io; - if (err == -ENOSPC && - ext4_count_free_blocks(sb)) { + if (err == -ENOSPC && ext4_count_free_clusters(sb)) { mpd->retval = err; goto submit_io; } @@ -2326,13 +1551,15 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) for (i = 0; i < map.m_len; i++) unmap_underlying_metadata(bdev, map.m_pblk + i); - } - if (ext4_should_order_data(mpd->inode)) { - err = ext4_jbd2_file_inode(handle, mpd->inode); - if (err) - /* This only happens if the journal is aborted */ - return; + if (ext4_should_order_data(mpd->inode)) { + err = ext4_jbd2_file_inode(handle, mpd->inode); + if (err) { + /* Only if the journal is aborted */ + mpd->retval = err; + goto submit_io; + } + } } /* @@ -2439,6 +1666,66 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) } /* + * This function is grabs code from the very beginning of + * ext4_map_blocks, but assumes that the caller is from delayed write + * time. This function looks up the requested blocks and sets the + * buffer delay bit under the protection of i_data_sem. + */ +static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, + struct ext4_map_blocks *map, + struct buffer_head *bh) +{ + int retval; + sector_t invalid_block = ~((sector_t) 0xffff); + + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + + map->m_flags = 0; + ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," + "logical block %lu\n", inode->i_ino, map->m_len, + (unsigned long) map->m_lblk); + /* + * Try to see if we can get the block without requesting a new + * file system block. + */ + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(NULL, inode, map, 0); + else + retval = ext4_ind_map_blocks(NULL, inode, map, 0); + + if (retval == 0) { + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? + */ + /* If the block was allocated from previously allocated cluster, + * then we dont need to reserve it again. */ + if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { + retval = ext4_da_reserve_space(inode, iblock); + if (retval) + /* not enough space to reserve */ + goto out_unlock; + } + + /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served + * and it should not appear on the bh->b_state. + */ + map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; + + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + } + +out_unlock: + up_read((&EXT4_I(inode)->i_data_sem)); + + return retval; +} + +/* * This is a special get_blocks_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. @@ -2455,10 +1742,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, { struct ext4_map_blocks map; int ret = 0; - sector_t invalid_block = ~((sector_t) 0xffff); - - if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) - invalid_block = ~0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); @@ -2471,25 +1754,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) + ret = ext4_da_map_blocks(inode, iblock, &map, bh); + if (ret <= 0) return ret; - if (ret == 0) { - if (buffer_delay(bh)) - return 0; /* Not sure this could or should happen */ - /* - * XXX: __block_write_begin() unmaps passed block, is it OK? - */ - ret = ext4_da_reserve_space(inode, iblock); - if (ret) - /* not enough space to reserve */ - return ret; - - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); - return 0; - } map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; @@ -2564,6 +1831,8 @@ static int __ext4_journalled_writepage(struct page *page, goto out; } + BUG_ON(!ext4_handle_valid(handle)); + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, do_journal_get_write_access); @@ -2571,6 +1840,7 @@ static int __ext4_journalled_writepage(struct page *page, write_end_fn); if (ret == 0) ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; err = ext4_journal_stop(handle); if (!ret) ret = err; @@ -2663,8 +1933,12 @@ static int ext4_writepage(struct page *page, * We don't want to do block allocation, so redirty * the page and return. We may reach here when we do * a journal commit via journal_submit_inode_data_buffers. - * We can also reach here via shrink_page_list + * We can also reach here via shrink_page_list but it + * should never be for direct reclaim so warn if that + * happens */ + WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == + PF_MEMALLOC); goto redirty_page; } if (commit_write) @@ -2741,7 +2015,7 @@ static int write_cache_pages_da(struct address_space *mapping, index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; @@ -2898,6 +2172,7 @@ static int ext4_da_writepages(struct address_space *mapping, struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); pgoff_t done_index = 0; pgoff_t end; + struct blk_plug plug; trace_ext4_da_writepages(inode, wbc); @@ -2973,9 +2248,10 @@ static int ext4_da_writepages(struct address_space *mapping, } retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); + blk_start_plug(&plug); while (!ret && wbc->nr_to_write > 0) { /* @@ -2994,6 +2270,7 @@ retry: ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); + blk_finish_plug(&plug); goto out_writepages; } @@ -3026,11 +2303,12 @@ retry: ret = 0; } else if (ret == MPAGE_DA_EXTENT_TAIL) { /* - * got one extent now try with - * rest of the pages + * Got one extent now try with rest of the pages. + * If mpd.retval is set -EIO, journal is aborted. + * So we don't need to write any more. */ pages_written += mpd.pages_written; - ret = 0; + ret = mpd.retval; io_done = 1; } else if (wbc->nr_to_write) /* @@ -3040,6 +2318,7 @@ retry: */ break; } + blk_finish_plug(&plug); if (!io_done && !cycled) { cycled = 1; index = 0; @@ -3078,10 +2357,11 @@ static int ext4_nonda_switch(struct super_block *sb) * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); + free_blocks = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); if (2 * free_blocks < 3 * dirty_blocks || - free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { + free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark @@ -3093,7 +2373,7 @@ static int ext4_nonda_switch(struct super_block *sb) * start pushing delalloc when 1/2 of free blocks are dirty. */ if (free_blocks < 2 * dirty_blocks) - writeback_inodes_sb_if_idle(sb); + writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); return 0; } @@ -3107,6 +2387,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; struct inode *inode = mapping->host; handle_t *handle; + loff_t page_len; index = pos >> PAGE_CACHE_SHIFT; @@ -3153,6 +2434,13 @@ retry: */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); + } else { + page_len = pos & (PAGE_CACHE_SIZE - 1); + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers_no_lock(handle, + inode, page, pos - page_len, page_len, + EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); + } } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -3195,6 +2483,7 @@ static int ext4_da_write_end(struct file *file, loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; + loff_t page_len; if (write_mode == FALL_BACK_TO_NONDELALLOC) { if (ext4_should_order_data(inode)) { @@ -3243,6 +2532,16 @@ static int ext4_da_write_end(struct file *file, } ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + + page_len = PAGE_CACHE_SIZE - + ((pos + copied - 1) & (PAGE_CACHE_SIZE - 1)); + + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers_no_lock(handle, + inode, page, pos + copied - 1, page_len, + EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); + } + copied = ret2; if (ret2 < 0) ret = ret2; @@ -3450,112 +2749,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } /* - * O_DIRECT for ext3 (or indirect map) based files - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - * If the O_DIRECT write is intantiating holes inside i_size and the machine - * crashes then stale disk data _may_ be exposed inside the file. But current - * VFS code falls back into buffered path in that case so we are safe. - */ -static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - handle_t *handle; - ssize_t ret; - int orphan = 0; - size_t count = iov_length(iov, nr_segs); - int retries = 0; - - if (rw == WRITE) { - loff_t final_size = offset + count; - - if (final_size > inode->i_size) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ei->i_disksize = inode->i_size; - ext4_journal_stop(handle); - } - } - -retry: - if (rw == READ && ext4_should_dioread_nolock(inode)) - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block, NULL, NULL, 0); - else { - ret = blockdev_direct_IO(rw, iocb, inode, iov, - offset, nr_segs, ext4_get_block); - - if (unlikely((rw & WRITE) && ret < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); - - if (end > isize) - ext4_truncate_failed_write(inode); - } - } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - - if (orphan) { - int err; - - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. - */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; - } -out: - return ret; -} - -/* * ext4_get_block used when preparing for a DIO write or buffer write. * We allocate an uinitialized extent if blocks haven't been allocated. * The extent will be converted to initialized after the IO is complete. @@ -3614,8 +2807,8 @@ out: spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); iocb->private = NULL; + queue_work(wq, &io_end->work); /* XXX: probably should move into the real I/O completion handler */ inode_dio_done(inode); @@ -3638,8 +2831,12 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) goto out; } - io_end->flag = EXT4_IO_END_UNWRITTEN; + /* + * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now, + * but being more careful is always safe for the future change. + */ inode = io_end->inode; + ext4_set_io_unwritten_flag(inode, io_end); /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); @@ -3805,6 +3002,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_mapping->host; ssize_t ret; + /* + * If we are doing data journalling we don't support O_DIRECT + */ + if (ext4_should_journal_data(inode)) + return 0; + trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); @@ -3874,6 +3077,7 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -3910,6 +3114,227 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_journalled_aops; } + +/* + * ext4_discard_partial_page_buffers() + * Wrapper function for ext4_discard_partial_page_buffers_no_lock. + * This function finds and locks the page containing the offset + * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. + * Calling functions that already have the page locked should call + * ext4_discard_partial_page_buffers_no_lock directly. + */ +int ext4_discard_partial_page_buffers(handle_t *handle, + struct address_space *mapping, loff_t from, + loff_t length, int flags) +{ + struct inode *inode = mapping->host; + struct page *page; + int err = 0; + + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return -ENOMEM; + + err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, + from, length, flags); + + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * ext4_discard_partial_page_buffers_no_lock() + * Zeros a page range of length 'length' starting from offset 'from'. + * Buffer heads that correspond to the block aligned regions of the + * zeroed range will be unmapped. Unblock aligned regions + * will have the corresponding buffer head mapped if needed so that + * that region of the page can be updated with the partial zero out. + * + * This function assumes that the page has already been locked. The + * The range to be discarded must be contained with in the given page. + * If the specified range exceeds the end of the page it will be shortened + * to the end of the page that corresponds to 'from'. This function is + * appropriate for updating a page and it buffer heads to be unmapped and + * zeroed for blocks that have been either released, or are going to be + * released. + * + * handle: The journal handle + * inode: The files inode + * page: A locked page that contains the offset "from" + * from: The starting byte offset (from the begining of the file) + * to begin discarding + * len: The length of bytes to discard + * flags: Optional flags that may be used: + * + * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED + * Only zero the regions of the page whose buffer heads + * have already been unmapped. This flag is appropriate + * for updateing the contents of a page whose blocks may + * have already been released, and we only want to zero + * out the regions that correspond to those released blocks. + * + * Returns zero on sucess or negative on failure. + */ +int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags) +{ + ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned int offset = from & (PAGE_CACHE_SIZE-1); + unsigned int blocksize, max, pos; + ext4_lblk_t iblock; + struct buffer_head *bh; + int err = 0; + + blocksize = inode->i_sb->s_blocksize; + max = PAGE_CACHE_SIZE - offset; + + if (index != page->index) + return -EINVAL; + + /* + * correct length if it does not fall between + * 'from' and the end of the page + */ + if (length > max || length < 0) + length = max; + + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) { + /* + * If the range to be discarded covers a partial block + * we need to get the page buffers. This is because + * partial blocks cannot be released and the page needs + * to be updated with the contents of the block before + * we write the zeros on top of it. + */ + if ((from & (blocksize - 1)) || + ((from + length) & (blocksize - 1))) { + create_empty_buffers(page, blocksize, 0); + } else { + /* + * If there are no partial blocks, + * there is nothing to update, + * so we can return now + */ + return 0; + } + } + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + pos = offset; + while (pos < offset + length) { + unsigned int end_of_block, range_to_discard; + + err = 0; + + /* The length of space left to zero and unmap */ + range_to_discard = offset + length - pos; + + /* The length of space until the end of the block */ + end_of_block = blocksize - (pos & (blocksize-1)); + + /* + * Do not unmap or zero past end of block + * for this buffer head + */ + if (range_to_discard > end_of_block) + range_to_discard = end_of_block; + + + /* + * Skip this buffer head if we are only zeroing unampped + * regions of the page + */ + if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && + buffer_mapped(bh)) + goto next; + + /* If the range is block aligned, unmap */ + if (range_to_discard == blocksize) { + clear_buffer_dirty(bh); + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); + clear_buffer_uptodate(bh); + zero_user(page, pos, range_to_discard); + BUFFER_TRACE(bh, "Buffer discarded"); + goto next; + } + + /* + * If this block is not completely contained in the range + * to be discarded, then it is not going to be released. Because + * we need to keep this block, we need to make sure this part + * of the page is uptodate before we modify it by writeing + * partial zeros on it. + */ + if (!buffer_mapped(bh)) { + /* + * Buffer head must be mapped before we can read + * from the block + */ + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto next; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt.*/ + if (!buffer_uptodate(bh)) + goto next; + } + + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto next; + } + + zero_user(page, pos, range_to_discard); + + err = 0; + if (ext4_should_journal_data(inode)) { + err = ext4_handle_dirty_metadata(handle, inode, bh); + } else + mark_buffer_dirty(bh); + + BUFFER_TRACE(bh, "Partial buffer zeroed"); +next: + bh = bh->b_this_page; + iblock++; + pos += range_to_discard; + } + + return err; +} + /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. @@ -3952,7 +3377,7 @@ int ext4_block_zero_page_range(handle_t *handle, page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, mapping_gfp_mask(mapping) & ~__GFP_FS); if (!page) - return -EINVAL; + return -ENOMEM; blocksize = inode->i_sb->s_blocksize; max = blocksize - (offset & (blocksize - 1)); @@ -4021,11 +3446,8 @@ int ext4_block_zero_page_range(handle_t *handle, err = 0; if (ext4_should_journal_data(inode)) { err = ext4_handle_dirty_metadata(handle, inode, bh); - } else { - if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) - err = ext4_jbd2_file_inode(handle, inode); + } else mark_buffer_dirty(bh); - } unlock: unlock_page(page); @@ -4033,383 +3455,6 @@ unlock: return err; } -/* - * Probably it should be a library function... search for first non-zero word - * or memcmp with zero_page, whatever is better for particular architecture. - * Linus? - */ -static inline int all_zeroes(__le32 *p, __le32 *q) -{ - while (p < q) - if (*p++) - return 0; - return 1; -} - -/** - * ext4_find_shared - find the indirect blocks for partial truncation. - * @inode: inode in question - * @depth: depth of the affected branch - * @offsets: offsets of pointers in that branch (see ext4_block_to_path) - * @chain: place to store the pointers to partial indirect blocks - * @top: place to the (detached) top of branch - * - * This is a helper function used by ext4_truncate(). - * - * When we do truncate() we may have to clean the ends of several - * indirect blocks but leave the blocks themselves alive. Block is - * partially truncated if some data below the new i_size is referred - * from it (and it is on the path to the first completely truncated - * data block, indeed). We have to free the top of that path along - * with everything to the right of the path. Since no allocation - * past the truncation point is possible until ext4_truncate() - * finishes, we may safely do the latter, but top of branch may - * require special attention - pageout below the truncation point - * might try to populate it. - * - * We atomically detach the top of branch from the tree, store the - * block number of its root in *@top, pointers to buffer_heads of - * partially truncated blocks - in @chain[].bh and pointers to - * their last elements that should not be removed - in - * @chain[].p. Return value is the pointer to last filled element - * of @chain. - * - * The work left to caller to do the actual freeing of subtrees: - * a) free the subtree starting from *@top - * b) free the subtrees whose roots are stored in - * (@chain[i].p+1 .. end of @chain[i].bh->b_data) - * c) free the subtrees growing from the inode past the @chain[0]. - * (no partially truncated stuff there). */ - -static Indirect *ext4_find_shared(struct inode *inode, int depth, - ext4_lblk_t offsets[4], Indirect chain[4], - __le32 *top) -{ - Indirect *partial, *p; - int k, err; - - *top = 0; - /* Make k index the deepest non-null offset + 1 */ - for (k = depth; k > 1 && !offsets[k-1]; k--) - ; - partial = ext4_get_branch(inode, k, offsets, chain, &err); - /* Writer: pointers */ - if (!partial) - partial = chain + k-1; - /* - * If the branch acquired continuation since we've looked at it - - * fine, it should all survive and (new) top doesn't belong to us. - */ - if (!partial->key && *partial->p) - /* Writer: end */ - goto no_top; - for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) - ; - /* - * OK, we've found the last block that must survive. The rest of our - * branch should be detached before unlocking. However, if that rest - * of branch is all ours and does not grow immediately from the inode - * it's easier to cheat and just decrement partial->p. - */ - if (p == chain + k - 1 && p > chain) { - p->p--; - } else { - *top = *p->p; - /* Nope, don't do this in ext4. Must leave the tree intact */ -#if 0 - *p->p = 0; -#endif - } - /* Writer: end */ - - while (partial > p) { - brelse(partial->bh); - partial--; - } -no_top: - return partial; -} - -/* - * Zero a number of block pointers in either an inode or an indirect block. - * If we restart the transaction we must again get write access to the - * indirect block for further modification. - * - * We release `count' blocks on disk, but (last - first) may be greater - * than `count' because there can be holes in there. - * - * Return 0 on success, 1 on invalid block range - * and < 0 on fatal error. - */ -static int ext4_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, - ext4_fsblk_t block_to_free, - unsigned long count, __le32 *first, - __le32 *last) -{ - __le32 *p; - int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; - int err; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, - count)) { - EXT4_ERROR_INODE(inode, "attempt to clear invalid " - "blocks %llu len %lu", - (unsigned long long) block_to_free, count); - return 1; - } - - if (try_to_extend_transaction(handle, inode)) { - if (bh) { - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (unlikely(err)) - goto out_err; - } - err = ext4_mark_inode_dirty(handle, inode); - if (unlikely(err)) - goto out_err; - err = ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); - if (unlikely(err)) - goto out_err; - if (bh) { - BUFFER_TRACE(bh, "retaking write access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) - goto out_err; - } - } - - for (p = first; p < last; p++) - *p = 0; - - ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); - return 0; -out_err: - ext4_std_error(inode->i_sb, err); - return err; -} - -/** - * ext4_free_data - free a list of data blocks - * @handle: handle for this transaction - * @inode: inode we are dealing with - * @this_bh: indirect buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: points immediately past the end of array - * - * We are freeing all blocks referred from that array (numbers are stored as - * little-endian 32-bit) and updating @inode->i_blocks appropriately. - * - * We accumulate contiguous runs of blocks to free. Conveniently, if these - * blocks are contiguous then releasing them at one time will only affect one - * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't - * actually use a lot of journal space. - * - * @this_bh will be %NULL if @first and @last point into the inode's direct - * block pointers. - */ -static void ext4_free_data(handle_t *handle, struct inode *inode, - struct buffer_head *this_bh, - __le32 *first, __le32 *last) -{ - ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ - unsigned long count = 0; /* Number of blocks in the run */ - __le32 *block_to_free_p = NULL; /* Pointer into inode/ind - corresponding to - block_to_free */ - ext4_fsblk_t nr; /* Current block # */ - __le32 *p; /* Pointer into inode/ind - for current block */ - int err = 0; - - if (this_bh) { /* For indirect block */ - BUFFER_TRACE(this_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, this_bh); - /* Important: if we can't update the indirect pointers - * to the blocks, we can't free them. */ - if (err) - return; - } - - for (p = first; p < last; p++) { - nr = le32_to_cpu(*p); - if (nr) { - /* accumulate blocks to free if they're contiguous */ - if (count == 0) { - block_to_free = nr; - block_to_free_p = p; - count = 1; - } else if (nr == block_to_free + count) { - count++; - } else { - err = ext4_clear_blocks(handle, inode, this_bh, - block_to_free, count, - block_to_free_p, p); - if (err) - break; - block_to_free = nr; - block_to_free_p = p; - count = 1; - } - } - } - - if (!err && count > 0) - err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, - count, block_to_free_p, p); - if (err < 0) - /* fatal error */ - return; - - if (this_bh) { - BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); - - /* - * The buffer head should have an attached journal head at this - * point. However, if the data is corrupted and an indirect - * block pointed to itself, it would have been detached when - * the block was cleared. Check for this instead of OOPSing. - */ - if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) - ext4_handle_dirty_metadata(handle, inode, this_bh); - else - EXT4_ERROR_INODE(inode, - "circular indirect block detected at " - "block %llu", - (unsigned long long) this_bh->b_blocknr); - } -} - -/** - * ext4_free_branches - free an array of branches - * @handle: JBD handle for this transaction - * @inode: inode we are dealing with - * @parent_bh: the buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: pointer immediately past the end of array - * @depth: depth of the branches to free - * - * We are freeing all blocks referred from these branches (numbers are - * stored as little-endian 32-bit) and updating @inode->i_blocks - * appropriately. - */ -static void ext4_free_branches(handle_t *handle, struct inode *inode, - struct buffer_head *parent_bh, - __le32 *first, __le32 *last, int depth) -{ - ext4_fsblk_t nr; - __le32 *p; - - if (ext4_handle_is_aborted(handle)) - return; - - if (depth--) { - struct buffer_head *bh; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - p = last; - while (--p >= first) { - nr = le32_to_cpu(*p); - if (!nr) - continue; /* A hole */ - - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), - nr, 1)) { - EXT4_ERROR_INODE(inode, - "invalid indirect mapped " - "block %lu (level %d)", - (unsigned long) nr, depth); - break; - } - - /* Go read the buffer for the next level down */ - bh = sb_bread(inode->i_sb, nr); - - /* - * A read failure? Report error and clear slot - * (should be rare). - */ - if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, nr, - "Read failure"); - continue; - } - - /* This zaps the entire block. Bottom up. */ - BUFFER_TRACE(bh, "free child branches"); - ext4_free_branches(handle, inode, bh, - (__le32 *) bh->b_data, - (__le32 *) bh->b_data + addr_per_block, - depth); - brelse(bh); - - /* - * Everything below this this pointer has been - * released. Now let this top-of-subtree go. - * - * We want the freeing of this indirect block to be - * atomic in the journal with the updating of the - * bitmap block which owns it. So make some room in - * the journal. - * - * We zero the parent pointer *after* freeing its - * pointee in the bitmaps, so if extend_transaction() - * for some reason fails to put the bitmap changes and - * the release into the same transaction, recovery - * will merely complain about releasing a free block, - * rather than leaking blocks. - */ - if (ext4_handle_is_aborted(handle)) - return; - if (try_to_extend_transaction(handle, inode)) { - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); - } - - /* - * The forget flag here is critical because if - * we are journaling (and not doing data - * journaling), we have to make sure a revoke - * record is written to prevent the journal - * replay from overwriting the (former) - * indirect block if it gets reallocated as a - * data block. This must happen in the same - * transaction where the data blocks are - * actually freed. - */ - ext4_free_blocks(handle, inode, NULL, nr, 1, - EXT4_FREE_BLOCKS_METADATA| - EXT4_FREE_BLOCKS_FORGET); - - if (parent_bh) { - /* - * The block which we have just freed is - * pointed to by an indirect block: journal it - */ - BUFFER_TRACE(parent_bh, "get_write_access"); - if (!ext4_journal_get_write_access(handle, - parent_bh)){ - *p = 0; - BUFFER_TRACE(parent_bh, - "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, - inode, - parent_bh); - } - } - } - } else { - /* We have reached the bottom of the tree. */ - BUFFER_TRACE(parent_bh, "free data blocks"); - ext4_free_data(handle, inode, parent_bh, first, last); - } -} - int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) @@ -4443,6 +3488,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) return -ENOTSUPP; } + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { + /* TODO: Add support for bigalloc file systems */ + return -ENOTSUPP; + } + return ext4_ext_punch_hole(file, offset, length); } @@ -4476,19 +3526,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) */ void ext4_truncate(struct inode *inode) { - handle_t *handle; - struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *i_data = ei->i_data; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - __le32 nr = 0; - int n = 0; - ext4_lblk_t last_block, max_block; - unsigned blocksize = inode->i_sb->s_blocksize; - trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) @@ -4499,149 +3536,11 @@ void ext4_truncate(struct inode *inode) if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ext4_ext_truncate(inode); - trace_ext4_truncate_exit(inode); - return; - } - - handle = start_transaction(inode); - if (IS_ERR(handle)) - return; /* AKPM: return what? */ - - last_block = (inode->i_size + blocksize-1) - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - - if (inode->i_size & (blocksize - 1)) - if (ext4_block_truncate_page(handle, mapping, inode->i_size)) - goto out_stop; - - if (last_block != max_block) { - n = ext4_block_to_path(inode, last_block, offsets, NULL); - if (n == 0) - goto out_stop; /* error */ - } - - /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. - * - * Implication: the file must always be in a sane, consistent - * truncatable state while each transaction commits. - */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. - */ - down_write(&ei->i_data_sem); - - ext4_discard_preallocations(inode); - - /* - * The orphan list entry will now protect us from any crash which - * occurs before the truncate completes, so it is now safe to propagate - * the new, shorter inode size (held for now in i_size) into the - * on-disk inode. We do this via i_disksize, which is the value which - * ext4 *really* writes onto the disk inode. - */ - ei->i_disksize = inode->i_size; - - if (last_block == max_block) { - /* - * It is unnecessary to free any data blocks if last_block is - * equal to the indirect block limit. - */ - goto out_unlock; - } else if (n == 1) { /* direct blocks */ - ext4_free_data(handle, inode, NULL, i_data+offsets[0], - i_data + EXT4_NDIR_BLOCKS); - goto do_indirects; - } - - partial = ext4_find_shared(inode, n, offsets, chain, &nr); - /* Kill the top of shared branch (not detached) */ - if (nr) { - if (partial == chain) { - /* Shared branch grows from the inode */ - ext4_free_branches(handle, inode, NULL, - &nr, &nr+1, (chain+n-1) - partial); - *partial->p = 0; - /* - * We mark the inode dirty prior to restart, - * and prior to stop. No need for it here. - */ - } else { - /* Shared branch grows from an indirect block */ - BUFFER_TRACE(partial->bh, "get_write_access"); - ext4_free_branches(handle, inode, partial->bh, - partial->p, - partial->p+1, (chain+n-1) - partial); - } - } - /* Clear the ends of indirect blocks on the shared branch */ - while (partial > chain) { - ext4_free_branches(handle, inode, partial->bh, partial->p + 1, - (__le32*)partial->bh->b_data+addr_per_block, - (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } -do_indirects: - /* Kill the remaining (whole) subtrees */ - switch (offsets[0]) { - default: - nr = i_data[EXT4_IND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); - i_data[EXT4_IND_BLOCK] = 0; - } - case EXT4_IND_BLOCK: - nr = i_data[EXT4_DIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); - i_data[EXT4_DIND_BLOCK] = 0; - } - case EXT4_DIND_BLOCK: - nr = i_data[EXT4_TIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); - i_data[EXT4_TIND_BLOCK] = 0; - } - case EXT4_TIND_BLOCK: - ; - } - -out_unlock: - up_write(&ei->i_data_sem); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); -out_stop: - /* - * If this was a simple ftruncate(), and the file will remain alive - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); + else + ext4_ind_truncate(inode); - ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); } @@ -4777,7 +3676,7 @@ make_io: trace_ext4_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ_META, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, @@ -4893,7 +3792,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_dir_start_lookup = 0; @@ -5012,7 +3911,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) { /* Validate block references which are part of inode */ - ret = ext4_check_inode_blockref(inode); + ret = ext4_ind_check_inode(inode); } if (ret) goto bad_inode; @@ -5459,34 +4358,10 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, - int chunk) -{ - int indirects; - - /* if nrblocks are contiguous */ - if (chunk) { - /* - * With N contiguous data blocks, we need at most - * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, - * 2 dindirect blocks, and 1 tindirect block - */ - return DIV_ROUND_UP(nrblocks, - EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; - } - /* - * if nrblocks are not contiguous, worse case, each block touch - * a indirect block, and each indirect block touch a double indirect - * block, plus a triple indirect block - */ - indirects = nrblocks * 2 + 1; - return indirects; -} - static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return ext4_indirect_trans_blocks(inode, nrblocks, chunk); + return ext4_ind_trans_blocks(inode, nrblocks, chunk); return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); } @@ -5919,6 +4794,7 @@ retry_alloc: PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); ret = VM_FAULT_SIGBUS; + ext4_journal_stop(handle); goto out; } ext4_set_inode_state(inode, EXT4_STATE_JDATA); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 808c554e773f..a56796814d6a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -21,6 +21,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; + struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; @@ -173,41 +174,24 @@ setversion_out: mnt_drop_write(filp->f_path.mnt); return err; } -#ifdef CONFIG_JBD2_DEBUG - case EXT4_IOC_WAIT_FOR_READONLY: - /* - * This is racy - by the time we're woken up and running, - * the superblock could be released. And the module could - * have been unloaded. So sue me. - * - * Returns 1 if it slept, else zero. - */ - { - struct super_block *sb = inode->i_sb; - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); - if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) { - schedule(); - ret = 1; - } - remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); - return ret; - } -#endif case EXT4_IOC_GROUP_EXTEND: { ext4_fsblk_t n_blocks_count; - struct super_block *sb = inode->i_sb; int err, err2=0; - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; + err = ext4_resize_begin(sb); + if (err) + return err; if (get_user(n_blocks_count, (__u32 __user *)arg)) return -EFAULT; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + return -EOPNOTSUPP; + } + err = mnt_want_write(filp->f_path.mnt); if (err) return err; @@ -221,6 +205,7 @@ setversion_out: if (err == 0) err = err2; mnt_drop_write(filp->f_path.mnt); + ext4_resize_end(sb); return err; } @@ -248,6 +233,13 @@ setversion_out: goto mext_out; } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with bigalloc"); + return -EOPNOTSUPP; + } + err = mnt_want_write(filp->f_path.mnt); if (err) goto mext_out; @@ -268,16 +260,23 @@ mext_out: case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; - struct super_block *sb = inode->i_sb; int err, err2=0; - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; + err = ext4_resize_begin(sb); + if (err) + return err; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, sizeof(input))) return -EFAULT; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + return -EOPNOTSUPP; + } + err = mnt_want_write(filp->f_path.mnt); if (err) return err; @@ -291,6 +290,7 @@ mext_out: if (err == 0) err = err2; mnt_drop_write(filp->f_path.mnt); + ext4_resize_end(sb); return err; } @@ -333,7 +333,6 @@ mext_out: case FITRIM: { - struct super_block *sb = inode->i_sb; struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; @@ -344,7 +343,14 @@ mext_out: if (!blk_queue_discard(q)) return -EOPNOTSUPP; - if (copy_from_user(&range, (struct fstrim_range *)arg, + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "FITRIM not supported with bigalloc"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; @@ -354,7 +360,7 @@ mext_out: if (ret < 0) return ret; - if (copy_to_user((struct fstrim_range *)arg, &range, + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; @@ -392,11 +398,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC32_SETVERSION_OLD: cmd = EXT4_IOC_SETVERSION_OLD; break; -#ifdef CONFIG_JBD2_DEBUG - case EXT4_IOC32_WAIT_FOR_READONLY: - cmd = EXT4_IOC_WAIT_FOR_READONLY; - break; -#endif case EXT4_IOC32_GETRSVSZ: cmd = EXT4_IOC_GETRSVSZ; break; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6ed859d56850..e2d8be8f28bf 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -70,13 +70,13 @@ * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space - * pa_len -> length for this prealloc space - * pa_free -> free space available in this prealloc space + * pa_len -> length for this prealloc space (in clusters) + * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc - * space we will consume the particular prealloc space. This make sure that - * that the we have contiguous physical blocks representing the file blocks + * space we will consume the particular prealloc space. This makes sure that + * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except @@ -84,7 +84,7 @@ * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group - * prealloc space. These are per CPU prealloc list repreasented as + * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * @@ -126,14 +126,16 @@ * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to - * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is + * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is + * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via - * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in + * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the - * stripe value (sbi->s_stripe) + * the smallest multiple of the stripe value (sbi->s_stripe) which is + * greater than the default mb_group_prealloc. * - * The regular allocator(using the buddy cache) supports few tunables. + * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan @@ -152,7 +154,7 @@ * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it - * can used for allocation. ext4_mb_good_group explains how the groups are + * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * Both the prealloc space are getting populated as above. So for the first @@ -458,7 +460,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += first + i; + blocknr += EXT4_C2B(EXT4_SB(sb), first + i); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, @@ -492,10 +494,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { - printk(KERN_ERR "corruption in group %u " - "at byte %u(%u): %x in copy != %x " - "on disk/prealloc\n", - e4b->bd_group, i, i * 8, b1[i], b2[i]); + ext4_msg(e4b->bd_sb, KERN_ERR, + "corruption in group %u " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc", + e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } @@ -578,7 +581,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, continue; } - /* both bits in buddy2 must be 0 */ + /* both bits in buddy2 must be 1 */ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); @@ -651,7 +654,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, ext4_grpblk_t chunk; unsigned short border; - BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); + BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; @@ -703,7 +706,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); - ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; ext4_grpblk_t len; @@ -732,7 +735,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u blocks in bitmap, %u in gd", + "%u clusters in bitmap, %u in gd", free, grp->bb_free); /* * If we intent to continue, we consider group descritor @@ -1125,7 +1128,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; - e4b->bd_info = ext4_get_group_info(sb, group); + e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_page = NULL; @@ -1281,7 +1284,7 @@ static void mb_clear_bits(void *bm, int cur, int len) } } -static void mb_set_bits(void *bm, int cur, int len) +void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1337,7 +1340,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += block; + blocknr += EXT4_C2B(EXT4_SB(sb), block); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, @@ -1388,7 +1391,6 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, { int next = block; int max; - int ord; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); @@ -1430,9 +1432,8 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) break; - ord = mb_find_order_for_block(e4b, next); + order = mb_find_order_for_block(e4b, next); - order = ord; block = next >> order; ex->fe_len += 1 << order; } @@ -1510,7 +1511,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); - mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -1622,8 +1623,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); - BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); - BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; @@ -1821,15 +1822,15 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, - EXT4_BLOCKS_PER_GROUP(sb), i); - if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { + EXT4_CLUSTERS_PER_GROUP(sb), i); + if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * we have free blocks */ ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, - "%d free blocks as per " + "%d free clusters as per " "group info. But bitmap says 0", free); break; @@ -1839,7 +1840,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, - "%d free blocks as per " + "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); /* @@ -1885,7 +1886,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; - while (i < EXT4_BLOCKS_PER_GROUP(sb)) { + while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { @@ -2223,8 +2224,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_KERNEL); if (meta_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate mem for a " - "buddy group\n"); + ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " + "for a buddy group"); goto exit_meta_group_info; } sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = @@ -2237,7 +2238,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); if (meta_group_info[i] == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); + ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); goto exit_group_info; } memset(meta_group_info[i], 0, kmem_cache_size(cachep)); @@ -2250,10 +2251,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, */ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { meta_group_info[i]->bb_free = - ext4_free_blocks_after_init(sb, group, desc); + ext4_free_clusters_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = - ext4_free_blks_count(sb, desc); + ext4_free_group_clusters(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); @@ -2279,8 +2280,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, exit_group_info: /* If a meta_group_info table has been allocated, release it now */ - if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; + } exit_meta_group_info: return -ENOMEM; } /* ext4_mb_add_groupinfo */ @@ -2328,23 +2331,26 @@ static int ext4_mb_init_backend(struct super_block *sb) /* An 8TB filesystem with 64-bit pointers requires a 4096 byte * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. * So a two level scheme suffices for now. */ - sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); + sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); if (sbi->s_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; } sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { - printk(KERN_ERR "EXT4-fs: can't get new inode\n"); + ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } - sbi->s_buddy_cache->i_ino = get_next_ino(); + /* To avoid potentially colliding with an valid on-disk inode number, + * use EXT4_BAD_INO for the buddy cache inode number. This inode is + * not in the inode hash, so it should never be found by iget(), but + * this will avoid confusion if it ever shows up during debugging. */ + sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { - printk(KERN_ERR - "EXT4-fs: can't read descriptor %u\n", i); + ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) @@ -2362,7 +2368,7 @@ err_freebuddy: kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); return -ENOMEM; } @@ -2404,14 +2410,15 @@ static int ext4_groupinfo_create_slab(size_t size) slab_size, 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_groupinfo_caches[cache_index] = cachep; + mutex_unlock(&ext4_grpinfo_slab_create_mutex); if (!cachep) { - printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); + printk(KERN_EMERG + "EXT4-fs: no memory for groupinfo slab cache\n"); return -ENOMEM; } - ext4_groupinfo_caches[cache_index] = cachep; - return 0; } @@ -2457,12 +2464,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) i++; } while (i <= sb->s_blocksize_bits + 1); - /* init file for buddy data */ - ret = ext4_mb_init_backend(sb); - if (ret != 0) { - goto out; - } - spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); @@ -2471,12 +2472,37 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + /* + * The default group preallocation is 512, which for 4k block + * sizes translates to 2 megabytes. However for bigalloc file + * systems, this is probably too big (i.e, if the cluster size + * is 1 megabyte, then group preallocation size becomes half a + * gigabyte!). As a default, we will keep a two megabyte + * group pralloc size for cluster sizes up to 64k, and after + * that, we will force a minimum group preallocation size of + * 32 clusters. This translates to 8 megs when the cluster + * size is 256k, and 32 megs when the cluster size is 1 meg, + * which seems reasonable as a default. + */ + sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> + sbi->s_cluster_bits, 32); + /* + * If there is a s_stripe > 1, then we set the s_mb_group_prealloc + * to the lowest multiple of s_stripe which is bigger than + * the s_mb_group_prealloc as determined above. We want + * the preallocation size to be an exact multiple of the + * RAID stripe size so that preallocations don't fragment + * the stripes. + */ + if (sbi->s_stripe > 1) { + sbi->s_mb_group_prealloc = roundup( + sbi->s_mb_group_prealloc, sbi->s_stripe); + } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; - goto out; + goto out_free_groupinfo_slab; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; @@ -2487,17 +2513,30 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) + goto out_free_locality_groups; + if (sbi->s_proc) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); if (sbi->s_journal) sbi->s_journal->j_commit_callback = release_blocks_on_commit; + + return 0; + +out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; +out_free_groupinfo_slab: + ext4_groupinfo_destroy_slabs(); out: - if (ret) { - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - } + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); + sbi->s_mb_maxs = NULL; return ret; } @@ -2544,32 +2583,32 @@ int ext4_mb_release(struct super_block *sb) EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) kfree(sbi->s_group_info[i]); - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); if (sbi->s_buddy_cache) iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { - printk(KERN_INFO - "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u blocks %u reqs (%u success)", atomic_read(&sbi->s_bal_allocated), atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); - printk(KERN_INFO - "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " - "%u 2^N hits, %u breaks, %u lost\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); - printk(KERN_INFO - "EXT4-fs: mballoc: %lu generated and it took %Lu\n", - sbi->s_mb_buddies_generated++, + ext4_msg(sb, KERN_INFO, + "mballoc: %lu generated and it took %Lu", + sbi->s_mb_buddies_generated, sbi->s_mb_generation_time); - printk(KERN_INFO - "EXT4-fs: mballoc: %u preallocated, %u discarded\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), atomic_read(&sbi->s_mb_discarded)); } @@ -2582,11 +2621,13 @@ int ext4_mb_release(struct super_block *sb) } static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t block, int count) + ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; - discard_block = block + ext4_group_first_block_no(sb, block_group); + discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + + ext4_group_first_block_no(sb, block_group)); + count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); @@ -2613,7 +2654,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) if (test_opt(sb, DISCARD)) ext4_issue_discard(sb, entry->group, - entry->start_blk, entry->count); + entry->start_cluster, entry->count); err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -2626,7 +2667,16 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) ext4_lock_group(sb, entry->group); /* Take it out of per group rb tree */ rb_erase(&entry->node, &(db->bb_free_root)); - mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); + + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree @@ -2723,7 +2773,7 @@ void ext4_exit_mballoc(void) */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle, unsigned int reserv_blks) + handle_t *handle, unsigned int reserv_clstrs) { struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; @@ -2754,7 +2804,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, goto out_err; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, - ext4_free_blks_count(sb, gdp)); + ext4_free_group_clusters(sb, gdp)); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) @@ -2762,7 +2812,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - len = ac->ac_b_ex.fe_len; + len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_data_block_valid(sbi, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata\n", block, block+len); @@ -2771,8 +2821,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * We leak some of the blocks here. */ ext4_lock_group(sb, ac->ac_b_ex.fe_group); - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) @@ -2790,31 +2840,33 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_blks_set(sb, gdp, - ext4_free_blocks_after_init(sb, - ac->ac_b_ex.fe_group, gdp)); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); } - len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; - ext4_free_blks_set(sb, gdp, len); + len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_group_clusters_set(sb, gdp, len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_blocks); + &sbi->s_flex_groups[flex_group].free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -2830,8 +2882,9 @@ out_err: /* * here we normalize request for locality group - * Group request are normalized to s_strip size if we set the same via mount - * option. If not we set it to s_mb_group_prealloc which can be configured via + * Group request are normalized to s_mb_group_prealloc, which goes to + * s_strip if we set the same via mount option. + * s_mb_group_prealloc can be configured via * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? @@ -2842,10 +2895,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); - if (EXT4_SB(sb)->s_stripe) - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; - else - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; mb_debug(1, "#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } @@ -2858,6 +2908,7 @@ static noinline_for_stack void ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits, max; ext4_lblk_t end; loff_t size, orig_size, start_off; @@ -2888,7 +2939,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* first, let's learn actual file size * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); @@ -2960,7 +3011,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, continue; } - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); /* PA must not overlap original request */ BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || @@ -2990,9 +3042,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { ext4_lblk_t pa_end; + spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); } spin_unlock(&pa->pa_lock); @@ -3001,20 +3055,21 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, if (start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical) { - printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", - (unsigned long) start, (unsigned long) size, - (unsigned long) ac->ac_o_ex.fe_logical); + ext4_msg(ac->ac_sb, KERN_ERR, + "start %lu, size %lu, fe_logical %lu", + (unsigned long) start, (unsigned long) size, + (unsigned long) ac->ac_o_ex.fe_logical); } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; - ac->ac_g_ex.fe_len = size; + ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size))) { @@ -3083,14 +3138,16 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); - end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); - len = end - start; + end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), + start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); + len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; @@ -3098,7 +3155,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); - BUG_ON(start + len > pa->pa_pstart + pa->pa_len); + BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); pa->pa_free -= len; @@ -3164,6 +3221,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, static noinline_for_stack int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; @@ -3181,12 +3239,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* all fields in this condition don't change, * so we can skip locking for them */ if (ac->ac_o_ex.fe_logical < pa->pa_lstart || - ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) + ac->ac_o_ex.fe_logical >= (pa->pa_lstart + + EXT4_C2B(sbi, pa->pa_len))) continue; /* non-extent files can't have physical blocks past 2^32 */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && - pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) + (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) continue; /* found preallocated blocks, use them */ @@ -3262,7 +3322,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, while (n) { entry = rb_entry(n, struct ext4_free_data, node); - mb_set_bits(bitmap, entry->start_blk, entry->count); + ext4_set_bits(bitmap, entry->start_cluster, entry->count); n = rb_next(n); } return; @@ -3283,7 +3343,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; - int count = 0; int len; /* all form of preallocation discards first load group, @@ -3304,9 +3363,8 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); - mb_set_bits(bitmap, start, len); + ext4_set_bits(bitmap, start, len); preallocated += len; - count++; } mb_debug(1, "prellocated %u for group %u\n", preallocated, group); } @@ -3383,6 +3441,7 @@ static noinline_for_stack int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info *grp; struct ext4_inode_info *ei; @@ -3414,16 +3473,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; /* also, we should cover whole original request */ - wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; + wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); /* the smallest one defines real window */ win = min(winl, wins); - offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; + offs = ac->ac_o_ex.fe_logical % + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (offs && offs < win) win = offs; - ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; + ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - + EXT4_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); } @@ -3448,7 +3509,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); - atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); @@ -3563,7 +3624,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); - grp_blk_start = pa->pa_pstart - bit; + grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; @@ -3578,16 +3639,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, free += next - bit; trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); - trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit, + trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + + EXT4_C2B(sbi, bit)), next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } if (free != pa->pa_free) { - printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", - pa, (unsigned long) pa->pa_lstart, - (unsigned long) pa->pa_pstart, - (unsigned long) pa->pa_len); + ext4_msg(e4b->bd_sb, KERN_CRIT, + "pa %p: logic %lu, phys. %lu, len %lu", + pa, (unsigned long) pa->pa_lstart, + (unsigned long) pa->pa_pstart, + (unsigned long) pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* @@ -3660,7 +3723,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, } if (needed == 0) - needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; + needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; INIT_LIST_HEAD(&list); repeat: @@ -3775,7 +3838,8 @@ repeat: * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); spin_unlock(&ei->i_prealloc_lock); - printk(KERN_ERR "uh-oh! used pa while discarding\n"); + ext4_msg(sb, KERN_ERR, + "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; @@ -3852,12 +3916,13 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; - printk(KERN_ERR "EXT4-fs: Can't allocate:" - " Allocation context details:\n"); - printk(KERN_ERR "EXT4-fs: status %d flags %d\n", + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" + " Allocation context details:"); + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", ac->ac_status, ac->ac_flags); - printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " - "best %lu/%lu/%lu@%lu cr %d\n", + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " + "goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, @@ -3871,9 +3936,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, - ac->ac_found); - printk(KERN_ERR "EXT4-fs: groups: \n"); + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", + ac->ac_ex_scanned, ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); @@ -3926,7 +3991,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; @@ -3937,6 +4002,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) return; } + if (sbi->s_mb_group_prealloc <= 0) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + /* don't use group allocation for large files */ size = max(size, isize); if (size > sbi->s_mb_stream_request) { @@ -3975,8 +4045,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, len = ar->len; /* just a dirty hack to filter too big requests */ - if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) - len = EXT4_BLOCKS_PER_GROUP(sb) - 10; + if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10) + len = EXT4_CLUSTERS_PER_GROUP(sb) - 10; /* start searching from the goal */ goal = ar->goal; @@ -3987,18 +4057,15 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, /* set up allocation goals */ memset(ac, 0, sizeof(struct ext4_allocation_context)); - ac->ac_b_ex.fe_logical = ar->logical; + ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; - ac->ac_o_ex.fe_logical = ar->logical; + ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; - ac->ac_g_ex.fe_logical = ar->logical; - ac->ac_g_ex.fe_group = group; - ac->ac_g_ex.fe_start = block; - ac->ac_g_ex.fe_len = len; + ac->ac_g_ex = ac->ac_o_ex; ac->ac_flags = ar->flags; /* we have to define context: we'll we work with a file or @@ -4150,13 +4217,14 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); - pa->pa_pstart += ac->ac_b_ex.fe_len; - pa->pa_lstart += ac->ac_b_ex.fe_len; + pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); @@ -4217,13 +4285,17 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct super_block *sb; ext4_fsblk_t block = 0; unsigned int inquota = 0; - unsigned int reserv_blks = 0; + unsigned int reserv_clstrs = 0; sb = ar->inode->i_sb; sbi = EXT4_SB(sb); trace_ext4_request_blocks(ar); + /* Allow to use superuser reservation for quota file */ + if (IS_NOQUOTA(ar->inode)) + ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; + /* * For delayed allocation, we could skip the ENOSPC and * EDQUOT check, as blocks and quotas have been already @@ -4237,7 +4309,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * and verify allocation doesn't exceed the quota limits. */ while (ar->len && - ext4_claim_free_blocks(sbi, ar->len, ar->flags)) { + ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ yield(); @@ -4247,12 +4319,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, *errp = -ENOSPC; return 0; } - reserv_blks = ar->len; + reserv_clstrs = ar->len; if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { - dquot_alloc_block_nofail(ar->inode, ar->len); + dquot_alloc_block_nofail(ar->inode, + EXT4_C2B(sbi, ar->len)); } else { while (ar->len && - dquot_alloc_block(ar->inode, ar->len)) { + dquot_alloc_block(ar->inode, + EXT4_C2B(sbi, ar->len))) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; @@ -4296,7 +4370,7 @@ repeat: ext4_mb_new_preallocation(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { - *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); if (*errp == -EAGAIN) { /* * drop the reference that we took @@ -4332,13 +4406,13 @@ out: if (ac) kmem_cache_free(ext4_ac_cachep, ac); if (inquota && ar->len < inquota) - dquot_free_block(ar->inode, inquota - ar->len); + dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { if (!ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, - reserv_blks); + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); } trace_ext4_allocate_blocks(ar, (unsigned long long)block); @@ -4356,7 +4430,7 @@ static int can_merge(struct ext4_free_data *entry1, { if ((entry1->t_tid == entry2->t_tid) && (entry1->group == entry2->group) && - ((entry1->start_blk + entry1->count) == entry2->start_blk)) + ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) return 1; return 0; } @@ -4366,7 +4440,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { ext4_group_t group = e4b->bd_group; - ext4_grpblk_t block; + ext4_grpblk_t cluster; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; @@ -4379,7 +4453,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, BUG_ON(e4b->bd_buddy_page == NULL); new_node = &new_entry->node; - block = new_entry->start_blk; + cluster = new_entry->start_cluster; if (!*n) { /* first free block exent. We need to @@ -4393,13 +4467,14 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_free_data, node); - if (block < entry->start_blk) + if (cluster < entry->start_cluster) n = &(*n)->rb_left; - else if (block >= (entry->start_blk + entry->count)) + else if (cluster >= (entry->start_cluster + entry->count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, - ext4_group_first_block_no(sb, group) + block, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); return 0; } @@ -4413,7 +4488,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, if (node) { entry = rb_entry(node, struct ext4_free_data, node); if (can_merge(entry, new_entry)) { - new_entry->start_blk = entry->start_blk; + new_entry->start_cluster = entry->start_cluster; new_entry->count += entry->count; rb_erase(node, &(db->bb_free_root)); spin_lock(&sbi->s_md_lock); @@ -4464,6 +4539,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; + unsigned int count_clusters; int err = 0; int ret; @@ -4512,6 +4588,38 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!ext4_should_writeback_data(inode)) flags |= EXT4_FREE_BLOCKS_METADATA; + /* + * If the extent to be freed does not begin on a cluster + * boundary, we need to deal with partial clusters at the + * beginning and end of the extent. Normally we will free + * blocks at the beginning or the end unless we are explicitly + * requested to avoid doing so. + */ + overflow = block & (sbi->s_cluster_ratio - 1); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { + overflow = sbi->s_cluster_ratio - overflow; + block += overflow; + if (count > overflow) + count -= overflow; + else + return; + } else { + block -= overflow; + count += overflow; + } + } + overflow = count & (sbi->s_cluster_ratio - 1); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { + if (count > overflow) + count -= overflow; + else + return; + } else + count += sbi->s_cluster_ratio - overflow; + } + do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -4520,10 +4628,12 @@ do_more: * Check to see if we are freeing blocks across a group * boundary. */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); + if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { + overflow = EXT4_C2B(sbi, bit) + count - + EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } + count_clusters = EXT4_B2C(sbi, count); bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) { err = -EIO; @@ -4538,9 +4648,9 @@ do_more: if (in_range(ext4_block_bitmap(sb, gdp), block, count) || in_range(ext4_inode_bitmap(sb, gdp), block, count) || in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || + EXT4_SB(sb)->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + EXT4_SB(sb)->s_itb_per_group)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); @@ -4565,11 +4675,11 @@ do_more: #ifdef AGGRESSIVE_CHECK { int i; - for (i = 0; i < count; i++) + for (i = 0; i < count_clusters; i++) BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - trace_ext4_mballoc_free(sb, inode, block_group, bit, count); + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) @@ -4586,13 +4696,13 @@ do_more: err = -ENOMEM; goto error_return; } - new_entry->start_blk = bit; + new_entry->start_cluster = bit; new_entry->group = block_group; - new_entry->count = count; + new_entry->count = count_clusters; new_entry->t_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { /* need to update group_info->bb_free and bitmap @@ -4600,25 +4710,29 @@ do_more: * them with group lock_held */ ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); - mb_free_blocks(inode, &e4b, bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + mb_free_blocks(inode, &e4b, bit, count_clusters); } - ret = ext4_free_blks_count(sb, gdp) + count; - ext4_free_blks_set(sb, gdp, ret); + ret = ext4_free_group_clusters(sb, gdp) + count_clusters; + ext4_free_group_clusters_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeblocks_counter, count); + percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); freed += count; + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -4637,15 +4751,13 @@ do_more: } ext4_mark_super_dirty(sb); error_return: - if (freed) - dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); return; } /** - * ext4_add_groupblocks() -- Add given blocks to an existing group + * ext4_group_add_blocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block * @block: start physcial block to add to the block group @@ -4653,7 +4765,7 @@ error_return: * * This marks the blocks as free in the bitmap and buddy. */ -void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, +int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { struct buffer_head *bitmap_bh = NULL; @@ -4666,25 +4778,35 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, struct ext4_buddy e4b; int err = 0, ret, blk_free_count; ext4_grpblk_t blocks_freed; - struct ext4_group_info *grp; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + if (count == 0) + return 0; + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - grp = ext4_get_group_info(sb, block_group); /* * Check to see if we are freeing blocks across a group * boundary. */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_warning(sb, "too much blocks added to group %u\n", + block_group); + err = -EINVAL; goto error_return; + } bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) + if (!bitmap_bh) { + err = -EIO; goto error_return; + } + desc = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!desc) + if (!desc) { + err = -EIO; goto error_return; + } if (in_range(ext4_block_bitmap(sb, desc), block, count) || in_range(ext4_inode_bitmap(sb, desc), block, count) || @@ -4694,6 +4816,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); + err = -EINVAL; goto error_return; } @@ -4735,16 +4858,17 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(NULL, &e4b, bit, count); - blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); - ext4_free_blks_set(sb, desc, blk_free_count); + blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); + ext4_free_group_clusters_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, blocks_freed)); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(blocks_freed, - &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(EXT4_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); @@ -4762,7 +4886,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, error_return: brelse(bitmap_bh); ext4_std_error(sb, err); - return; + return err; } /** @@ -4782,6 +4906,8 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, { struct ext4_free_extent ex; + trace_ext4_trim_extent(sb, group, start, count); + assert_spin_locked(ext4_group_lock_ptr(sb, group)); ex.fe_start = start; @@ -4802,7 +4928,7 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, /** * ext4_trim_all_free -- function to trim all free space in alloc. group * @sb: super block for file system - * @e4b: ext4 buddy + * @group: group to be trimmed * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count @@ -4823,10 +4949,12 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t minblocks) { void *bitmap; - ext4_grpblk_t next, count = 0; + ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; int ret; + trace_ext4_trim_all_free(sb, group, start, max); + ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { ext4_error(sb, "Error in loading buddy " @@ -4836,6 +4964,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); + if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && + minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) + goto out; + start = (e4b.bd_info->bb_first_free > start) ? e4b.bd_info->bb_first_free : start; @@ -4850,6 +4982,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next - start, group, &e4b); count += next - start; } + free_count += next - start; start = next + 1; if (fatal_signal_pending(current)) { @@ -4863,9 +4996,13 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); } - if ((e4b.bd_info->bb_free - count) < minblocks) + if ((e4b.bd_info->bb_free - free_count) < minblocks) break; } + + if (!ret) + EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); +out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -4892,7 +5029,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) struct ext4_group_info *grp; ext4_group_t first_group, last_group; ext4_group_t group, ngroups = ext4_get_groups_count(sb); - ext4_grpblk_t cnt = 0, first_block, last_block; + ext4_grpblk_t cnt = 0, first_cluster, last_cluster; uint64_t start, len, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); @@ -4902,8 +5039,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) len = range->len >> sb->s_blocksize_bits; minlen = range->minlen >> sb->s_blocksize_bits; - if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) + if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) return -EINVAL; + if (start + len <= first_data_blk) + goto out; if (start < first_data_blk) { len -= first_data_blk - start; start = first_data_blk; @@ -4911,11 +5050,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) /* Determine first and last group to examine based on start and len */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, - &first_group, &first_block); + &first_group, &first_cluster); ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), - &last_group, &last_block); + &last_group, &last_cluster); last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; - last_block = EXT4_BLOCKS_PER_GROUP(sb); + last_cluster = EXT4_CLUSTERS_PER_GROUP(sb); if (first_group > last_group) return -EINVAL; @@ -4935,22 +5074,26 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) * change it for the last group in which case start + * len < EXT4_BLOCKS_PER_GROUP(sb). */ - if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb)) - last_block = first_block + len; - len -= last_block - first_block; + if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) + last_cluster = first_cluster + len; + len -= last_cluster - first_cluster; if (grp->bb_free >= minlen) { - cnt = ext4_trim_all_free(sb, group, first_block, - last_block, minlen); + cnt = ext4_trim_all_free(sb, group, first_cluster, + last_cluster, minlen); if (cnt < 0) { ret = cnt; break; } } trimmed += cnt; - first_block = 0; + first_cluster = 0; } range->len = trimmed * sb->s_blocksize; + if (!ret) + atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); + +out: return ret; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 20b5e7bfebd1..47705f3285e3 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -106,7 +106,7 @@ struct ext4_free_data { ext4_group_t group; /* free block extent */ - ext4_grpblk_t start_blk; + ext4_grpblk_t start_cluster; ext4_grpblk_t count; /* transaction which freed this extent */ @@ -139,9 +139,9 @@ enum { struct ext4_free_extent { ext4_lblk_t fe_logical; - ext4_grpblk_t fe_start; + ext4_grpblk_t fe_start; /* In cluster units */ ext4_group_t fe_group; - ext4_grpblk_t fe_len; + ext4_grpblk_t fe_len; /* In cluster units */ }; /* @@ -175,7 +175,7 @@ struct ext4_allocation_context { /* the best found extent */ struct ext4_free_extent ac_b_ex; - /* copy of the bext found extent taken before preallocation efforts */ + /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; /* number of iterations done. we have to track to limit searching */ @@ -187,7 +187,6 @@ struct ext4_allocation_context { __u16 ac_flags; /* allocation hints */ __u8 ac_status; __u8 ac_criteria; - __u8 ac_repeats; __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ @@ -217,6 +216,7 @@ struct ext4_buddy { static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { - return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start; + return ext4_group_first_block_no(sb, fex->fe_group) + + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } #endif diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b57b98fb44d1..16ac228dbec6 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -15,19 +15,18 @@ #include <linux/module.h> #include <linux/slab.h> #include "ext4_jbd2.h" -#include "ext4_extents.h" /* * The contiguous blocks details which can be * represented by a single extent */ -struct list_blocks_struct { - ext4_lblk_t first_block, last_block; +struct migrate_struct { + ext4_lblk_t first_block, last_block, curr_block; ext4_fsblk_t first_pblock, last_pblock; }; static int finish_range(handle_t *handle, struct inode *inode, - struct list_blocks_struct *lb) + struct migrate_struct *lb) { int retval = 0, needed; @@ -87,8 +86,7 @@ err_out: } static int update_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t blk_num, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, struct migrate_struct *lb) { int retval; /* @@ -96,9 +94,10 @@ static int update_extent_range(handle_t *handle, struct inode *inode, */ if (lb->first_pblock && (lb->last_pblock+1 == pblock) && - (lb->last_block+1 == blk_num)) { + (lb->last_block+1 == lb->curr_block)) { lb->last_pblock = pblock; - lb->last_block = blk_num; + lb->last_block = lb->curr_block; + lb->curr_block++; return 0; } /* @@ -106,64 +105,49 @@ static int update_extent_range(handle_t *handle, struct inode *inode, */ retval = finish_range(handle, inode, lb); lb->first_pblock = lb->last_pblock = pblock; - lb->first_block = lb->last_block = blk_num; - + lb->first_block = lb->last_block = lb->curr_block; + lb->curr_block++; return retval; } static int update_ind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries; - return 0; - } - bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; i_data = (__le32 *)bh->b_data; - for (i = 0; i < max_entries; i++, blk_count++) { + for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; + } else { + lb->curr_block++; } } - - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; } static int update_dind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries * max_entries; - return 0; - } bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; @@ -172,38 +156,28 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode, for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_ind_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - &blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; } else { /* Only update the file block number */ - blk_count += max_entries; + lb->curr_block += max_entries; } } - - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; } static int update_tind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries * max_entries * max_entries; - return 0; - } bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; @@ -212,16 +186,14 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode, for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_dind_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - &blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; - } else + } else { /* Only update the file block number */ - blk_count += max_entries * max_entries; + lb->curr_block += max_entries * max_entries; + } } - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; @@ -462,12 +434,12 @@ int ext4_ext_migrate(struct inode *inode) handle_t *handle; int retval = 0, i; __le32 *i_data; - ext4_lblk_t blk_count = 0; struct ext4_inode_info *ei; struct inode *tmp_inode = NULL; - struct list_blocks_struct lb; + struct migrate_struct lb; unsigned long max_entries; __u32 goal; + uid_t owner[2]; /* * If the filesystem does not support extents, or the inode @@ -495,10 +467,12 @@ int ext4_ext_migrate(struct inode *inode) } goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; + owner[0] = inode->i_uid; + owner[1] = inode->i_gid; tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG, NULL, goal); + S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { - retval = -ENOMEM; + retval = PTR_ERR(inode); ext4_journal_stop(handle); return retval; } @@ -507,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode) * Set the i_nlink to zero so it will be deleted later * when we drop inode reference. */ - tmp_inode->i_nlink = 0; + clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); ext4_orphan_add(handle, tmp_inode); @@ -551,35 +525,32 @@ int ext4_ext_migrate(struct inode *inode) /* 32 bit block address 4 bytes */ max_entries = inode->i_sb->s_blocksize >> 2; - for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) { + for (i = 0; i < EXT4_NDIR_BLOCKS; i++) { if (i_data[i]) { retval = update_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[i]), - blk_count, &lb); + le32_to_cpu(i_data[i]), &lb); if (retval) goto err_out; - } + } else + lb.curr_block++; } if (i_data[EXT4_IND_BLOCK]) { retval = update_ind_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[EXT4_IND_BLOCK]), - &blk_count, &lb); + le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); if (retval) goto err_out; } else - blk_count += max_entries; + lb.curr_block += max_entries; if (i_data[EXT4_DIND_BLOCK]) { retval = update_dind_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[EXT4_DIND_BLOCK]), - &blk_count, &lb); + le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); if (retval) goto err_out; } else - blk_count += max_entries * max_entries; + lb.curr_block += max_entries * max_entries; if (i_data[EXT4_TIND_BLOCK]) { retval = update_tind_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[EXT4_TIND_BLOCK]), - &blk_count, &lb); + le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); if (retval) goto err_out; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 9bdef3f537c5..7ea4ba4eff2a 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -109,7 +109,7 @@ static int kmmpd(void *data) mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); bdevname(bh->b_bdev, mmp->mmp_bdevname); - memcpy(mmp->mmp_nodename, init_utsname()->sysname, + memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); while (!kthread_should_stop()) { @@ -125,8 +125,9 @@ static int kmmpd(void *data) * Don't spew too many error messages. Print one every * (s_mmp_update_interval * 60) seconds. */ - if (retval && (failed_writes % 60) == 0) { - ext4_error(sb, "Error writing to MMP block"); + if (retval) { + if ((failed_writes % 60) == 0) + ext4_error(sb, "Error writing to MMP block"); failed_writes++; } @@ -295,7 +296,8 @@ skip: /* * write a new random sequence number. */ - mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); + seq = mmp_new_seq(); + mmp->mmp_seq = cpu_to_le32(seq); retval = write_mmp_block(bh); if (retval) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index f57455a1b1b2..c5826c623e7a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -17,7 +17,6 @@ #include <linux/quotaops.h> #include <linux/slab.h> #include "ext4_jbd2.h" -#include "ext4_extents.h" #include "ext4.h" /** diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 707d605bf769..aa4c782c9dd7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -289,7 +289,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent while (len--) printk("%c", *name++); ext4fs_dirhash(de->name, de->name_len, &h); printk(":%x.%u ", h.hash, - ((char *) de - base)); + (unsigned) ((char *) de - base)); } space += EXT4_DIR_REC_LEN(de->name_len); names++; @@ -922,7 +922,8 @@ restart: bh = ext4_getblk(NULL, dir, b++, 0, &err); bh_use[ra_max] = bh; if (bh) - ll_rw_block(READ_META, 1, &bh); + ll_rw_block(READ | REQ_META | REQ_PRIO, + 1, &bh); } } if ((bh = bh_use[ra_ptr++]) == NULL) @@ -1013,7 +1014,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q *err = -ENOENT; errout: - dxtrace(printk(KERN_DEBUG "%s not found\n", name)); + dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); dx_release (frames); return NULL; } @@ -1585,7 +1586,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_metadata(handle, inode, bh2); + err = ext4_handle_dirty_metadata(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); @@ -1611,7 +1612,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); if (err) { ext4_std_error(inode->i_sb, err); goto cleanup; @@ -1693,7 +1694,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode) if (is_dx(inode) && inode->i_nlink > 1) { /* limit is 16-bit i_links_count */ if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { - inode->i_nlink = 1; + set_nlink(inode, 1); EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_DIR_NLINK); } @@ -1706,9 +1707,8 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode) */ static void ext4_dec_count(handle_t *handle, struct inode *inode) { - drop_nlink(inode); - if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0) - inc_nlink(inode); + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); } @@ -1755,7 +1755,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; @@ -1791,7 +1791,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1831,7 +1831,7 @@ retry: ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFDIR | mode, - &dentry->d_name, 0); + &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -1860,9 +1860,9 @@ retry: de->name_len = 2; strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); - inode->i_nlink = 2; + set_nlink(inode, 2); BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, dir_block); + err = ext4_handle_dirty_metadata(handle, inode, dir_block); if (err) goto out_clear_inode; err = ext4_mark_inode_dirty(handle, inode); @@ -1985,18 +1985,11 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!list_empty(&EXT4_I(inode)->i_orphan)) goto out_unlock; - /* Orphan handling is only valid for files with data blocks - * being truncated, or files being unlinked. */ - - /* @@@ FIXME: Observation from aviro: - * I think I can trigger J_ASSERT in ext4_orphan_add(). We block - * here (on s_orphan_lock), so race with ext4_link() which might bump - * ->i_nlink. For, say it, character device. Not a regular file, - * not a directory, not a symlink and ->i_nlink > 0. - * - * tytso, 4/25/2009: I'm not sure how that could happen; - * shouldn't the fs core protect us from these sort of - * unlink()/link() races? + /* + * Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. Note that we either + * hold i_mutex, or the inode can not be referenced from outside, + * so i_nlink should not be bumped due to race */ J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); @@ -2220,7 +2213,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) ext4_warning(inode->i_sb, "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = ext4_delete_entry(handle, dir, de, bh); if (retval) @@ -2260,9 +2253,11 @@ static int ext4_symlink(struct inode *dir, /* * For non-fast symlinks, we just allocate inode and put it on * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks. + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. */ - credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT4_XATTR_TRANS_BLOCKS; } else { /* * Fast symlink. We have to add entry to directory @@ -2283,7 +2278,7 @@ retry: ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, - &dentry->d_name, 0); + &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2534,7 +2529,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); if (retval) { ext4_std_error(old_dir->i_sb, retval); goto end_rename; @@ -2543,7 +2538,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode) { /* checked empty_dir above, can't have another parent, * ext4_dec_count() won't work for many-linked dirs */ - new_inode->i_nlink = 0; + clear_nlink(new_inode); } else { ext4_inc_count(handle, new_dir); ext4_update_dx_flag(new_dir); @@ -2590,7 +2585,7 @@ const struct inode_operations ext4_dir_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext4_check_acl, + .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; @@ -2602,5 +2597,5 @@ const struct inode_operations ext4_special_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext4_check_acl, + .get_acl = ext4_get_acl, }; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7bb8f76d470a..7ce1d0b19c94 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -70,7 +70,6 @@ static void put_io_page(struct ext4_io_page *io_page) void ext4_free_io_end(ext4_io_end_t *io) { int i; - wait_queue_head_t *wq; BUG_ON(!io); if (io->page) @@ -78,56 +77,43 @@ void ext4_free_io_end(ext4_io_end_t *io) for (i = 0; i < io->num_io_pages; i++) put_io_page(io->pages[i]); io->num_io_pages = 0; - wq = ext4_ioend_wq(io->inode); - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && - waitqueue_active(wq)) - wake_up_all(wq); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io->inode)); kmem_cache_free(io_end_cachep, io); } /* * check a range of space and convert unwritten extents to written. + * + * Called with inode->i_mutex; we depend on this when we manipulate + * io->flag, since we could otherwise race with ext4_flush_completed_IO() */ int ext4_end_io_nolock(ext4_io_end_t *io) { struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; - wait_queue_head_t *wq; int ret = 0; ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - if (list_empty(&io->list)) - return ret; - - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) - return ret; - ret = ext4_convert_unwritten_extents(inode, offset, size); if (ret < 0) { - printk(KERN_EMERG "%s: failed to convert unwritten " - "extents to written extents, error is %d " - "io is still on inode %lu aio dio list\n", - __func__, ret, inode->i_ino); - return ret; + ext4_msg(inode->i_sb, KERN_EMERG, + "failed to convert unwritten extents to written " + "extents -- potential data loss! " + "(inode %lu, offset %llu, size %zd, error %d)", + inode->i_ino, offset, size, ret); } if (io->iocb) aio_complete(io->iocb, io->result, 0); - /* clear the DIO AIO unwritten flag */ - if (io->flag & EXT4_IO_END_UNWRITTEN) { - io->flag &= ~EXT4_IO_END_UNWRITTEN; - /* Wake up anyone waiting on unwritten extent conversion */ - wq = ext4_ioend_wq(io->inode); - if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) && - waitqueue_active(wq)) { - wake_up_all(wq); - } - } + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) + wake_up_all(ext4_ioend_wq(io->inode)); return ret; } @@ -140,20 +126,36 @@ static void ext4_end_io_work(struct work_struct *work) struct inode *inode = io->inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned long flags; - int ret; - mutex_lock(&inode->i_mutex); - ret = ext4_end_io_nolock(io); - if (ret < 0) { - mutex_unlock(&inode->i_mutex); - return; + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (list_empty(&io->list)) { + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + goto free; } - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (!list_empty(&io->list)) - list_del_init(&io->list); + if (!mutex_trylock(&inode->i_mutex)) { + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + /* + * Requeue the work instead of waiting so that the work + * items queued after this can be processed. + */ + queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); + /* + * To prevent the ext4-dio-unwritten thread from keeping + * requeueing end_io requests and occupying cpu for too long, + * yield the cpu if it sees an end_io request that has already + * been requeued. + */ + if (io->flag & EXT4_IO_END_QUEUED) + yield(); + io->flag |= EXT4_IO_END_QUEUED; + return; + } + list_del_init(&io->list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + (void) ext4_end_io_nolock(io); mutex_unlock(&inode->i_mutex); +free: ext4_free_io_end(io); } @@ -285,11 +287,7 @@ static int io_submit_init(struct ext4_io_submit *io, io_end = ext4_init_io_end(inode, GFP_NOFS); if (!io_end) return -ENOMEM; - do { - bio = bio_alloc(GFP_NOIO, nvecs); - nvecs >>= 1; - } while (bio == NULL); - + bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_private = io->io_end = io_end; @@ -339,7 +337,7 @@ submit_and_retry: (io_end->pages[io_end->num_io_pages-1] != io_page)) goto submit_and_retry; if (buffer_uninit(bh)) - io->io_end->flag |= EXT4_IO_END_UNWRITTEN; + ext4_set_io_unwritten_flag(inode, io_end); io->io_end->size += bh->b_size; io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 80bbc9c60c24..996780ab4f4e 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -16,6 +16,35 @@ #include "ext4_jbd2.h" +int ext4_resize_begin(struct super_block *sb) +{ + int ret = 0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + /* + * We are not allowed to do online-resizing on a filesystem mounted + * with error, because it can destroy the filesystem easily. + */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + ext4_warning(sb, "There are errors in the filesystem, " + "so online resizing is not allowed\n"); + return -EPERM; + } + + if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags)) + ret = -EBUSY; + + return ret; +} + +void ext4_resize_end(struct super_block *sb) +{ + clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); + smp_mb__after_clear_bit(); +} + #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) @@ -118,10 +147,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, brelse(bh); bh = ERR_PTR(err); } else { - lock_buffer(bh); memset(bh->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bh); - unlock_buffer(bh); } return bh; @@ -132,8 +159,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, * If that fails, restart the transaction & regain write access for the * buffer head which is used for block_bitmap modifications. */ -static int extend_or_restart_transaction(handle_t *handle, int thresh, - struct buffer_head *bh) +static int extend_or_restart_transaction(handle_t *handle, int thresh) { int err; @@ -144,9 +170,8 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, if (err < 0) return err; if (err) { - if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) - return err; - if ((err = ext4_journal_get_write_access(handle, bh))) + err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); + if (err) return err; } @@ -181,21 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - mutex_lock(&sbi->s_resize_lock); - if (input->group != sbi->s_groups_count) { - err = -EBUSY; - goto exit_journal; - } - - if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { - err = PTR_ERR(bh); - goto exit_journal; - } - - if (ext4_bg_has_super(sb, input->group)) { - ext4_debug("mark backup superblock %#04llx (+0)\n", start); - ext4_set_bit(0, bh->b_data); - } + BUG_ON(input->group != sbi->s_groups_count); /* Copy all of the GDT blocks into the backup in this group */ for (i = 0, bit = 1, block = start + 1; @@ -203,29 +214,26 @@ static int setup_new_group_blocks(struct super_block *sb, struct buffer_head *gdb; ext4_debug("update backup group %#04llx (+%d)\n", block, bit); - - if ((err = extend_or_restart_transaction(handle, 1, bh))) - goto exit_bh; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto exit_journal; gdb = sb_getblk(sb, block); if (!gdb) { err = -EIO; - goto exit_bh; + goto exit_journal; } if ((err = ext4_journal_get_write_access(handle, gdb))) { brelse(gdb); - goto exit_bh; + goto exit_journal; } - lock_buffer(gdb); memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); - unlock_buffer(gdb); err = ext4_handle_dirty_metadata(handle, NULL, gdb); if (unlikely(err)) { brelse(gdb); - goto exit_bh; + goto exit_journal; } - ext4_set_bit(bit, bh->b_data); brelse(gdb); } @@ -235,9 +243,22 @@ static int setup_new_group_blocks(struct super_block *sb, err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, GFP_NOFS); if (err) - goto exit_bh; - for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) - ext4_set_bit(bit, bh->b_data); + goto exit_journal; + + err = extend_or_restart_transaction(handle, 2); + if (err) + goto exit_journal; + + bh = bclean(handle, sb, input->block_bitmap); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto exit_journal; + } + + if (ext4_bg_has_super(sb, input->group)) { + ext4_debug("mark backup group tables %#04llx (+0)\n", start); + ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); + } ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, input->block_bitmap - start); @@ -253,12 +274,9 @@ static int setup_new_group_blocks(struct super_block *sb, err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); if (err) goto exit_bh; - for (i = 0, bit = input->inode_table - start; - i < sbi->s_itb_per_group; i++, bit++) - ext4_set_bit(bit, bh->b_data); + ext4_set_bits(bh->b_data, input->inode_table - start, + sbi->s_itb_per_group); - if ((err = extend_or_restart_transaction(handle, 2, bh))) - goto exit_bh; ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); @@ -285,7 +303,6 @@ exit_bh: brelse(bh); exit_journal: - mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; @@ -377,15 +394,15 @@ static int verify_reserved_gdb(struct super_block *sb, * fail once we start modifying the data on disk, because JBD has no rollback. */ static int add_new_gdb(handle_t *handle, struct inode *inode, - struct ext4_new_group_data *input, - struct buffer_head **primary) + ext4_group_t group) { struct super_block *sb = inode->i_sb; struct ext4_super_block *es = EXT4_SB(sb)->s_es; - unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; struct buffer_head **o_group_desc, **n_group_desc; struct buffer_head *dind; + struct buffer_head *gdb_bh; int gdbackups; struct ext4_iloc iloc; __le32 *data; @@ -408,11 +425,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return -EPERM; } - *primary = sb_bread(sb, gdblock); - if (!*primary) + gdb_bh = sb_bread(sb, gdblock); + if (!gdb_bh) return -EIO; - if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + gdbackups = verify_reserved_gdb(sb, gdb_bh); + if (gdbackups < 0) { err = gdbackups; goto exit_bh; } @@ -427,7 +445,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { ext4_warning(sb, "new group %u GDT block %llu not reserved", - input->group, gdblock); + group, gdblock); err = -EINVAL; goto exit_dind; } @@ -436,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (unlikely(err)) goto exit_dind; - err = ext4_journal_get_write_access(handle, *primary); + err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) goto exit_sbh; @@ -449,12 +467,13 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (unlikely(err)) goto exit_dindj; - n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), - GFP_NOFS); + n_group_desc = ext4_kvmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), + GFP_NOFS); if (!n_group_desc) { err = -ENOMEM; - ext4_warning(sb, - "not enough memory for %lu groups", gdb_num + 1); + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); goto exit_inode; } @@ -475,8 +494,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; ext4_mark_iloc_dirty(handle, inode, &iloc); - memset((*primary)->b_data, 0, sb->s_blocksize); - err = ext4_handle_dirty_metadata(handle, NULL, *primary); + memset(gdb_bh->b_data, 0, sb->s_blocksize); + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); goto exit_inode; @@ -486,10 +505,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, o_group_desc = EXT4_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); - n_group_desc[gdb_num] = *primary; + n_group_desc[gdb_num] = gdb_bh; EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; - kfree(o_group_desc); + ext4_kvfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); @@ -499,6 +518,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return err; exit_inode: + ext4_kvfree(n_group_desc); /* ext4_handle_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); exit_dindj: @@ -508,7 +528,7 @@ exit_sbh: exit_dind: brelse(dind); exit_bh: - brelse(*primary); + brelse(gdb_bh); ext4_debug("leaving with error %d\n", err); return err; @@ -528,7 +548,7 @@ exit_bh: * backup GDT blocks are stored in their reserved primary GDT block. */ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, - struct ext4_new_group_data *input) + ext4_group_t group) { struct super_block *sb = inode->i_sb; int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); @@ -599,7 +619,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, * Finally we can add each of the reserved backup GDT blocks from * the new group to its reserved primary GDT block. */ - blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); + blk = group * EXT4_BLOCKS_PER_GROUP(sb); for (i = 0; i < reserved_gdb; i++) { int err2; data = (__le32 *)primary[i]->b_data; @@ -799,13 +819,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) goto exit_put; } - mutex_lock(&sbi->s_resize_lock); - if (input->group != sbi->s_groups_count) { - ext4_warning(sb, "multiple resizers run on filesystem!"); - err = -EBUSY; - goto exit_journal; - } - if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) goto exit_journal; @@ -820,16 +833,25 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if ((err = ext4_journal_get_write_access(handle, primary))) goto exit_journal; - if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && - (err = reserve_backup_gdb(handle, inode, input))) + if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) { + err = reserve_backup_gdb(handle, inode, input->group); + if (err) + goto exit_journal; + } + } else { + /* + * Note that we can access new group descriptor block safely + * only if add_new_gdb() succeeds. + */ + err = add_new_gdb(handle, inode, input->group); + if (err) goto exit_journal; - } else if ((err = add_new_gdb(handle, inode, input, &primary))) - goto exit_journal; + primary = sbi->s_group_desc[gdb_num]; + } /* * OK, now we've set up the new group. Time to make it active. * - * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -853,7 +875,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ - ext4_free_blks_set(sb, gdp, input->free_blocks_count); + ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); @@ -886,13 +908,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold s_resize_lock - * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold s_resize_lock over the access - * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. * @@ -919,8 +937,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) input->reserved_blocks); /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeblocks_counter, - input->free_blocks_count); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, input->free_blocks_count)); percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb)); @@ -928,8 +946,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, input->group); - atomic_add(input->free_blocks_count, - &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(EXT4_B2C(sbi, input->free_blocks_count), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb), &sbi->s_flex_groups[flex_group].free_inodes); } @@ -937,10 +955,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ext4_handle_dirty_super(handle, sb); exit_journal: - mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; - if (!err) { + if (!err && primary) { update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, sizeof(struct ext4_super_block)); update_backups(sb, primary->b_blocknr, primary->b_data, @@ -969,16 +986,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_grpblk_t add; struct buffer_head *bh; handle_t *handle; - int err; + int err, err2; ext4_group_t group; - /* We don't need to worry about locking wrt other resizers just - * yet: we're going to revalidate es->s_blocks_count after - * taking the s_resize_lock below. */ o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", + printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) @@ -995,7 +1009,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, if (n_blocks_count < o_blocks_count) { ext4_warning(sb, "can't shrink FS - resize aborted"); - return -EBUSY; + return -EINVAL; } /* Handle the remaining blocks in the last group only. */ @@ -1038,32 +1052,25 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } - mutex_lock(&EXT4_SB(sb)->s_resize_lock); - if (o_blocks_count != ext4_blocks_count(es)) { - ext4_warning(sb, "multiple resizers run on filesystem!"); - mutex_unlock(&EXT4_SB(sb)->s_resize_lock); - ext4_journal_stop(handle); - err = -EBUSY; - goto exit_put; - } - if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) { ext4_warning(sb, "error %d on journal write access", err); - mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); - mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ - ext4_add_groupblocks(handle, sb, o_blocks_count, add); + err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); ext4_handle_dirty_super(handle, sb); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); - if ((err = ext4_journal_stop(handle))) + err2 = ext4_journal_stop(handle); + if (!err && err2) + err = err2; + + if (err) goto exit_put; if (test_opt(sb, DEBUG)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9ea71aa864b3..3858767ec672 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -45,6 +45,7 @@ #include <linux/freezer.h> #include "ext4.h" +#include "ext4_extents.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" @@ -110,6 +111,35 @@ static struct file_system_type ext3_fs_type = { #define IS_EXT3_SB(sb) (0) #endif +void *ext4_kvmalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags); + if (!ret) + ret = __vmalloc(size, flags, PAGE_KERNEL); + return ret; +} + +void *ext4_kvzalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kzalloc(size, flags); + if (!ret) + ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); + return ret; +} + +void ext4_kvfree(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); + +} + ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { @@ -134,8 +164,8 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb, (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } -__u32 ext4_free_blks_count(struct super_block *sb, - struct ext4_group_desc *bg) +__u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? @@ -190,8 +220,8 @@ void ext4_inode_table_set(struct super_block *sb, bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } -void ext4_free_blks_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count) +void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) { bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) @@ -269,6 +299,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) journal_t *journal; handle_t *handle; + trace_ext4_journal_start(sb, nblocks, _RET_IP_); if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); @@ -384,6 +415,22 @@ static void save_error_info(struct super_block *sb, const char *func, ext4_commit_super(sb, 1); } +/* + * The del_gendisk() function uninitializes the disk-specific data + * structures, including the bdi structure, without telling anyone + * else. Once this happens, any attempt to call mark_buffer_dirty() + * (for example, by ext4_commit_super), will cause a kernel OOPS. + * This is a kludge to prevent these oops until we can put in a proper + * hook in del_gendisk() to inform the VFS and file system layers. + */ +static int block_device_ejected(struct super_block *sb) +{ + struct inode *bd_inode = sb->s_bdev->bd_inode; + struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; + + return bdi->dev == NULL; +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. @@ -789,15 +836,12 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); - if (is_vmalloc_addr(sbi->s_flex_groups)) - vfree(sbi->s_flex_groups); - else - kfree(sbi->s_flex_groups); - percpu_counter_destroy(&sbi->s_freeblocks_counter); + ext4_kvfree(sbi->s_group_desc); + ext4_kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -892,7 +936,6 @@ static void ext4_i_callback(struct rcu_head *head) static void ext4_destroy_inode(struct inode *inode) { - ext4_ioend_wait(inode); if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", @@ -1031,8 +1074,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouid32"); if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) seq_puts(seq, ",debug"); - if (test_opt(sb, OLDALLOC)) - seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT4_FS_XATTR if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -1541,10 +1582,12 @@ static int parse_options(char *options, struct super_block *sb, set_opt(sb, DEBUG); break; case Opt_oldalloc: - set_opt(sb, OLDALLOC); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated oldalloc option"); break; case Opt_orlov: - clear_opt(sb, OLDALLOC); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated orlov option"); break; #ifdef CONFIG_EXT4_FS_XATTR case Opt_user_xattr: @@ -1640,7 +1683,9 @@ static int parse_options(char *options, struct super_block *sb, data_opt = EXT4_MOUNT_WRITEBACK_DATA; datacheck: if (is_remount) { - if (test_opt(sb, DATA_FLAGS) != data_opt) { + if (!sbi->s_journal) + ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); + else if (test_opt(sb, DATA_FLAGS) != data_opt) { ext4_msg(sb, KERN_ERR, "Cannot change data mode on remount"); return 0; @@ -1775,6 +1820,7 @@ set_qf_format: break; case Opt_nodelalloc: clear_opt(sb, DELALLOC); + clear_opt2(sb, EXPLICIT_DELALLOC); break; case Opt_mblk_io_submit: set_opt(sb, MBLK_IO_SUBMIT); @@ -1791,6 +1837,7 @@ set_qf_format: break; case Opt_delalloc: set_opt(sb, DELALLOC); + set_opt2(sb, EXPLICIT_DELALLOC); break; case Opt_block_validity: set_opt(sb, BLOCK_VALIDITY); @@ -1909,7 +1956,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, res = MS_RDONLY; } if (read_only) - return res; + goto done; if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); @@ -1940,6 +1987,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, 1); +done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", @@ -1976,15 +2024,11 @@ static int ext4_fill_flex_info(struct super_block *sb) ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; size = flex_group_count * sizeof(struct flex_groups); - sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); + sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); if (sbi->s_flex_groups == NULL) { - sbi->s_flex_groups = vzalloc(size); - if (sbi->s_flex_groups == NULL) { - ext4_msg(sb, KERN_ERR, - "not enough memory for %u flex groups", - flex_group_count); - goto failed; - } + ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", + flex_group_count); + goto failed; } for (i = 0; i < sbi->s_groups_count; i++) { @@ -1993,8 +2037,8 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group = ext4_flex_group(sbi, i); atomic_add(ext4_free_inodes_count(sb, gdp), &sbi->s_flex_groups[flex_group].free_inodes); - atomic_add(ext4_free_blks_count(sb, gdp), - &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(ext4_free_group_clusters(sb, gdp), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); } @@ -2112,7 +2156,8 @@ static int ext4_check_descriptors(struct super_block *sb, if (NULL != first_not_zeroed) *first_not_zeroed = grp; - ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); + ext4_free_blocks_count_set(sbi->s_es, + EXT4_C2B(sbi, ext4_count_free_clusters(sb))); sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); return 1; } @@ -2383,17 +2428,25 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); + int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) - return sbi->s_stripe; - - if (stripe_width <= sbi->s_blocks_per_group) - return stripe_width; + ret = sbi->s_stripe; + else if (stripe_width <= sbi->s_blocks_per_group) + ret = stripe_width; + else if (stride <= sbi->s_blocks_per_group) + ret = stride; + else + ret = 0; - if (stride <= sbi->s_blocks_per_group) - return stride; + /* + * If the stripe width is 1, this makes no sense and + * we set it to 0 to turn off stripe handling code. + */ + if (ret <= 1) + ret = 0; - return 0; + return ret; } /* sysfs supprt */ @@ -2424,7 +2477,8 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, char *buf) { return snprintf(buf, PAGE_SIZE, "%llu\n", - (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + (s64) EXT4_C2B(sbi, + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); } static ssize_t session_write_kbytes_show(struct ext4_attr *a, @@ -2652,6 +2706,13 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) && + !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_msg(sb, KERN_ERR, + "Can't support bigalloc feature without " + "extents feature\n"); + return 0; + } return 1; } @@ -3040,8 +3101,6 @@ static void ext4_destroy_lazyinit_thread(void) } static int ext4_fill_super(struct super_block *sb, void *data, int silent) - __releases(kernel_lock) - __acquires(kernel_lock) { char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; @@ -3057,10 +3116,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) char *cp; const char *descr; int ret = -ENOMEM; - int blocksize; + int blocksize, clustersize; unsigned int db_count; unsigned int i; - int needs_recovery, has_huge_files; + int needs_recovery, has_huge_files, has_bigalloc; __u64 blocks_count; int err; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; @@ -3194,6 +3253,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) &journal_ioprio, NULL, 0)) goto failed_mount; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " + "with data=journal disables delayed " + "allocation and O_DIRECT support!\n"); + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + goto failed_mount; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + goto failed_mount; + } + if (test_opt(sb, DELALLOC)) + clear_opt(sb, DELALLOC); + } + + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + if (test_opt(sb, DIOREAD_NOLOCK)) { + if (blocksize < PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "dioread_nolock if block size != PAGE_SIZE"); + goto failed_mount; + } + } + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); @@ -3235,8 +3321,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) goto failed_mount; - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { ext4_msg(sb, KERN_ERR, @@ -3339,12 +3423,53 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_dirt = 1; } - if (sbi->s_blocks_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#blocks per group too big: %lu", - sbi->s_blocks_per_group); - goto failed_mount; + /* Handle clustersize */ + clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); + has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC); + if (has_bigalloc) { + if (clustersize < blocksize) { + ext4_msg(sb, KERN_ERR, + "cluster size (%d) smaller than " + "block size (%d)", clustersize, blocksize); + goto failed_mount; + } + sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - + le32_to_cpu(es->s_log_block_size); + sbi->s_clusters_per_group = + le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#clusters per group too big: %lu", + sbi->s_clusters_per_group); + goto failed_mount; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / blocksize))) { + ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " + "clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, + sbi->s_clusters_per_group); + goto failed_mount; + } + } else { + if (clustersize != blocksize) { + ext4_warning(sb, "fragment/cluster size (%d) != " + "block size (%d)", clustersize, + blocksize); + clustersize = blocksize; + } + if (sbi->s_blocks_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + sbi->s_clusters_per_group = sbi->s_blocks_per_group; + sbi->s_cluster_bits = 0; } + sbi->s_cluster_ratio = clustersize / blocksize; + if (sbi->s_inodes_per_group > blocksize * 8) { ext4_msg(sb, KERN_ERR, "#inodes per group too big: %lu", @@ -3408,17 +3533,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), - GFP_KERNEL); + sbi->s_group_desc = ext4_kvmalloc(db_count * + sizeof(struct buffer_head *), + GFP_KERNEL); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); goto failed_mount; } -#ifdef CONFIG_PROC_FS if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); -#endif bgl_lock_init(sbi->s_blockgroup_lock); @@ -3452,8 +3576,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.function = print_daily_error_info; sbi->s_err_report.data = (unsigned long) sb; - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); + err = percpu_counter_init(&sbi->s_freeclusters_counter, + ext4_count_free_clusters(sb)); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, ext4_count_free_inodes(sb)); @@ -3463,7 +3587,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_count_dirs(sb)); } if (!err) { - err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); @@ -3491,7 +3615,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); - mutex_init(&sbi->s_resize_lock); + sbi->s_resize_flags = 0; sb->s_root = NULL; @@ -3578,13 +3702,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * The journal may have updated the bg summary counts, so we * need to update the global counters. */ - percpu_counter_set(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); + percpu_counter_set(&sbi->s_freeclusters_counter, + ext4_count_free_clusters(sb)); percpu_counter_set(&sbi->s_freeinodes_counter, ext4_count_free_inodes(sb)); percpu_counter_set(&sbi->s_dirs_counter, ext4_count_dirs(sb)); - percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); + percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); no_journal: /* @@ -3648,25 +3772,6 @@ no_journal: "available"); } - if (test_opt(sb, DELALLOC) && - (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { - ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " - "requested data journaling mode"); - clear_opt(sb, DELALLOC); - } - if (test_opt(sb, DIOREAD_NOLOCK)) { - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " - "option - requested data journaling mode"); - clear_opt(sb, DIOREAD_NOLOCK); - } - if (sb->s_blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " - "option - block size is too small"); - clear_opt(sb, DIOREAD_NOLOCK); - } - } - err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " @@ -3679,22 +3784,19 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); - goto failed_mount4; + goto failed_mount5; } err = ext4_register_li_request(sb, first_not_zeroed); if (err) - goto failed_mount4; + goto failed_mount6; sbi->s_kobj.kset = ext4_kset; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, "%s", sb->s_id); - if (err) { - ext4_mb_release(sb); - ext4_ext_release(sb); - goto failed_mount4; - }; + if (err) + goto failed_mount7; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); @@ -3728,35 +3830,37 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; +failed_mount7: + ext4_unregister_li_request(sb); +failed_mount6: + ext4_ext_release(sb); +failed_mount5: + ext4_mb_release(sb); + ext4_release_system_zone(sb); failed_mount4: iput(root); sb->s_root = NULL; ext4_msg(sb, KERN_ERR, "mount failed"); destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); failed_mount_wq: - ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3: del_timer(&sbi->s_err_report); - if (sbi->s_flex_groups) { - if (is_vmalloc_addr(sbi->s_flex_groups)) - vfree(sbi->s_flex_groups); - else - kfree(sbi->s_flex_groups); - } - percpu_counter_destroy(&sbi->s_freeblocks_counter); + if (sbi->s_flex_groups) + ext4_kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); + ext4_kvfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { remove_proc_entry(sb->s_id, ext4_proc_root); @@ -4037,7 +4141,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; - if (!sbh) + if (!sbh || block_device_ejected(sb)) return error; if (buffer_write_io_error(sbh)) { /* @@ -4073,8 +4177,9 @@ static int ext4_commit_super(struct super_block *sb, int sync) else es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - ext4_free_blocks_count_set(es, percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeblocks_counter)); + ext4_free_blocks_count_set(es, + EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeclusters_counter))); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeinodes_counter)); @@ -4479,16 +4584,34 @@ restore_opts: return err; } +/* + * Note: calculating the overhead so we can be compatible with + * historical BSD practice is quite difficult in the face of + * clusters/bigalloc. This is because multiple metadata blocks from + * different block group can end up in the same allocation cluster. + * Calculating the exact overhead in the face of clustered allocation + * requires either O(all block bitmaps) in memory or O(number of block + * groups**2) in time. We will still calculate the superblock for + * older file systems --- and if we come across with a bigalloc file + * system with zero in s_overhead_clusters the estimate will be close to + * correct especially for very large cluster sizes --- but for newer + * file systems, it's better to calculate this figure once at mkfs + * time, and store it in the superblock. If the superblock value is + * present (even for non-bigalloc file systems), we will use it. + */ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + struct ext4_group_desc *gdp; u64 fsid; s64 bfree; if (test_opt(sb, MINIX_DF)) { sbi->s_overhead_last = 0; + } else if (es->s_overhead_clusters) { + sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters); } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; @@ -4503,24 +4626,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) * All of the blocks before first_data_block are * overhead */ - overhead = le32_to_cpu(es->s_first_data_block); + overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); /* - * Add the overhead attributed to the superblock and - * block group descriptors. If the sparse superblocks - * feature is turned on, then not all groups have this. + * Add the overhead found in each block group */ for (i = 0; i < ngroups; i++) { - overhead += ext4_bg_has_super(sb, i) + - ext4_bg_num_gdb(sb, i); + gdp = ext4_get_group_desc(sb, i, NULL); + overhead += ext4_num_overhead_clusters(sb, i, gdp); cond_resched(); } - - /* - * Every block group has an inode bitmap, a block - * bitmap, and an inode table. - */ - overhead += ngroups * (2 + sbi->s_itb_per_group); sbi->s_overhead_last = overhead; smp_wmb(); sbi->s_blocks_last = ext4_blocks_count(es); @@ -4528,11 +4643,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; - bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - - percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); + buf->f_blocks = (ext4_blocks_count(es) - + EXT4_C2B(sbi, sbi->s_overhead_last)); + bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ - buf->f_bfree = max_t(s64, bfree, 0); + buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; @@ -4953,13 +5069,11 @@ static int __init ext4_init_fs(void) return err; err = ext4_init_system_zone(); if (err) - goto out7; + goto out6; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); if (!ext4_kset) - goto out6; - ext4_proc_root = proc_mkdir("fs/ext4", NULL); - if (!ext4_proc_root) goto out5; + ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = ext4_init_feat_adverts(); if (err) @@ -4995,12 +5109,12 @@ out2: out3: ext4_exit_feat_adverts(); out4: - remove_proc_entry("fs/ext4", NULL); -out5: + if (ext4_proc_root) + remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); -out6: +out5: ext4_exit_system_zone(); -out7: +out6: ext4_exit_pageio(); return err; } diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h new file mode 100644 index 000000000000..011ba6670d99 --- /dev/null +++ b/fs/ext4/truncate.h @@ -0,0 +1,43 @@ +/* + * linux/fs/ext4/truncate.h + * + * Common inline functions needed for truncate support + */ + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static inline void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) +{ + ext4_lblk_t needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext4 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT4_MAX_TRANS_DATA) + needed = EXT4_MAX_TRANS_DATA; + + return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c757adc97250..93a00d89a220 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -820,8 +820,14 @@ inserted: if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + /* + * take i_data_sem because we will test + * i_delalloc_reserved_flag in ext4_mb_new_blocks + */ + down_read((&EXT4_I(inode)->i_data_sem)); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); + up_read((&EXT4_I(inode)->i_data_sem)); if (error) goto cleanup; @@ -985,11 +991,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); - error = ext4_get_inode_loc(inode, &is.iloc); - if (error) - goto cleanup; - - error = ext4_journal_get_write_access(handle, is.iloc.bh); + error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 007c3bfbf094..34e4350dd4d9 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -48,28 +48,32 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int -ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext4_initxattrs, handle); +} + const struct xattr_handler ext4_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext4_xattr_security_list, diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 4ad64732cbce..aca191bd5f8f 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -156,8 +156,8 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii, } else { if (uni_xlate == 1) { *op++ = ':'; - op = pack_hex_byte(op, ec >> 8); - op = pack_hex_byte(op, ec); + op = hex_byte_pack(op, ec >> 8); + op = hex_byte_pack(op, ec); len -= 5; } else { *op++ = '?'; @@ -1231,7 +1231,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, struct super_block *sb = dir->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ - struct msdos_dir_entry *de; + struct msdos_dir_entry *uninitialized_var(de); int err, free_slots, i, nr_bhs; loff_t pos, i_pos; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index a5d3853822e0..1510a4d51990 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -326,15 +326,14 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent, extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); /* fat/misc.c */ -extern void -__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))) __cold; +extern __printf(3, 4) __cold +void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...); #define fat_fs_error(sb, fmt, args...) \ __fat_fs_error(sb, 1, fmt , ## args) #define fat_fs_error_ratelimit(sb, fmt, args...) \ __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) -void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))) __cold; +__printf(3, 4) __cold +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 5942fec22c65..808cac7edcfb 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -379,7 +379,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) return error; MSDOS_I(inode)->mmu_private = inode->i_size; - inode->i_nlink = fat_subdirs(inode); + set_nlink(inode, fat_subdirs(inode)); } else { /* not a directory */ inode->i_generation |= 1; inode->i_mode = fat_make_mode(sbi, de->attr, @@ -1188,9 +1188,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, out: /* UTF-8 doesn't provide FAT semantics */ if (!strcmp(opts->iocharset, "utf8")) { - fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset" + fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset" " for FAT filesystems, filesystem will be " - "case sensitive!\n"); + "case sensitive!"); } /* If user doesn't specify allow_utime, it's initialized from dmask. */ @@ -1233,7 +1233,7 @@ static int fat_read_root(struct inode *inode) fat_save_attrs(inode, ATTR_DIR); inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; - inode->i_nlink = fat_subdirs(inode)+2; + set_nlink(inode, fat_subdirs(inode)+2); return 0; } @@ -1367,6 +1367,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sbi->free_clusters = -1; /* Don't know yet */ sbi->free_clus_valid = 0; sbi->prev_free = FAT_START_ENT; + sb->s_maxbytes = 0xffffffff; if (!sbi->fat_length && b->fat32_length) { struct fat_boot_fsinfo *fsinfo; @@ -1377,8 +1378,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sbi->fat_length = le32_to_cpu(b->fat32_length); sbi->root_cluster = le32_to_cpu(b->root_cluster); - sb->s_maxbytes = 0xffffffff; - /* MC - if info_sector is 0, don't multiply by 0 */ sbi->fsinfo_sector = le16_to_cpu(b->info_sector); if (sbi->fsinfo_sector == 0) diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 66e83b845455..216b419f30e2 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -387,7 +387,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) /* the directory was completed, just return a error */ goto out; } - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_mtime = inode->i_atime = inode->i_ctime = ts; /* timestamp is already written, so mark_inode_dirty() is unneeded. */ diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index bb3f29c3557b..a87a65663c25 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -900,7 +900,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out; } inode->i_version++; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_mtime = inode->i_atime = inode->i_ctime = ts; /* timestamp is already written, so mark_inode_dirty() is unneeded. */ diff --git a/fs/file_table.c b/fs/file_table.c index 01e4c1e8e6b6..c322794f7360 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -25,7 +25,7 @@ #include <linux/percpu.h> #include <linux/ima.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "internal.h" diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 1a4311437a8b..7b2af5abe2fa 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -227,7 +227,7 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip) ip->i_uid = (uid_t)vip->vii_uid; ip->i_gid = (gid_t)vip->vii_gid; - ip->i_nlink = vip->vii_nlink; + set_nlink(ip, vip->vii_nlink); ip->i_size = vip->vii_size; ip->i_atime.tv_sec = vip->vii_atime; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b8c507ca42f7..73c3992b2bb4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -35,15 +35,29 @@ struct wb_writeback_work { long nr_pages; struct super_block *sb; + unsigned long *older_than_this; enum writeback_sync_modes sync_mode; + unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; + enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ struct completion *done; /* set if the caller waits */ }; +const char *wb_reason_name[] = { + [WB_REASON_BACKGROUND] = "background", + [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages", + [WB_REASON_SYNC] = "sync", + [WB_REASON_PERIODIC] = "periodic", + [WB_REASON_LAPTOP_TIMER] = "laptop_timer", + [WB_REASON_FREE_MORE_MEM] = "free_more_memory", + [WB_REASON_FS_FREE_SPACE] = "fs_free_space", + [WB_REASON_FORKER_THREAD] = "forker_thread" +}; + /* * Include the creation of the trace points after defining the * wb_writeback_work structure so that the definition remains local to this @@ -113,7 +127,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic) + bool range_cyclic, enum wb_reason reason) { struct wb_writeback_work *work; @@ -133,6 +147,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; + work->reason = reason; bdi_queue_work(bdi, work); } @@ -148,9 +163,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, * completion. Caller need not hold sb s_umount semaphore. * */ -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + enum wb_reason reason) { - __bdi_start_writeback(bdi, nr_pages, true); + __bdi_start_writeback(bdi, nr_pages, true, reason); } /** @@ -180,12 +196,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) */ void inode_wb_list_del(struct inode *inode) { - spin_lock(&inode_wb_list_lock); + struct backing_dev_info *bdi = inode_to_bdi(inode); + + spin_lock(&bdi->wb.list_lock); list_del_init(&inode->i_wb_list); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); } - /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. @@ -195,11 +212,9 @@ void inode_wb_list_del(struct inode *inode) * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ -static void redirty_tail(struct inode *inode) +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -213,11 +228,9 @@ static void redirty_tail(struct inode *inode) /* * requeue inode for re-scanning after bdi->b_io list is exhausted. */ -static void requeue_io(struct inode *inode) +static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); list_move(&inode->i_wb_list, &wb->b_more_io); } @@ -225,7 +238,7 @@ static void inode_sync_complete(struct inode *inode) { /* * Prevent speculative execution through - * spin_unlock(&inode_wb_list_lock); + * spin_unlock(&wb->list_lock); */ smp_mb(); @@ -250,31 +263,33 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) /* * Move expired dirty inodes from @delaying_queue to @dispatch_queue. */ -static void move_expired_inodes(struct list_head *delaying_queue, +static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - unsigned long *older_than_this) + struct wb_writeback_work *work) { LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; struct inode *inode; int do_sb_sort = 0; + int moved = 0; while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (older_than_this && - inode_dirtied_after(inode, *older_than_this)) + if (work->older_than_this && + inode_dirtied_after(inode, *work->older_than_this)) break; if (sb && sb != inode->i_sb) do_sb_sort = 1; sb = inode->i_sb; list_move(&inode->i_wb_list, &tmp); + moved++; } /* just one sb in list, splice to dispatch_queue and we're done */ if (!do_sb_sort) { list_splice(&tmp, dispatch_queue); - return; + goto out; } /* Move inodes from one superblock together */ @@ -286,6 +301,8 @@ static void move_expired_inodes(struct list_head *delaying_queue, list_move(&inode->i_wb_list, dispatch_queue); } } +out: + return moved; } /* @@ -299,11 +316,13 @@ static void move_expired_inodes(struct list_head *delaying_queue, * | * +--> dequeue for IO */ -static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) +static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) { - assert_spin_locked(&inode_wb_list_lock); + int moved; + assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); + trace_writeback_queue_io(wb, work, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) @@ -316,7 +335,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) /* * Wait for writeback on an inode to complete. */ -static void inode_wait_for_writeback(struct inode *inode) +static void inode_wait_for_writeback(struct inode *inode, + struct bdi_writeback *wb) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wqh; @@ -324,15 +344,15 @@ static void inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under inode_wb_list_lock and + * Write out an inode's dirty pages. Called under wb->list_lock and * inode->i_lock. Either the caller has an active reference on the inode or * the inode has I_WILL_FREE set. * @@ -343,13 +363,15 @@ static void inode_wait_for_writeback(struct inode *inode) * livelocks, etc. */ static int -writeback_single_inode(struct inode *inode, struct writeback_control *wbc) +writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; + long nr_to_write = wbc->nr_to_write; unsigned dirty; int ret; - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); if (!atomic_read(&inode->i_count)) @@ -367,14 +389,16 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completed a full scan of b_io. */ if (wbc->sync_mode != WB_SYNC_ALL) { - requeue_io(inode); + requeue_io(inode, wb); + trace_writeback_single_inode_requeue(inode, wbc, + nr_to_write); return 0; } /* * It's a data-integrity sync. We must wait. */ - inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode, wb); } BUG_ON(inode->i_state & I_SYNC); @@ -383,7 +407,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); ret = do_writepages(mapping, wbc); @@ -414,10 +438,19 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { + /* + * Sync livelock prevention. Each inode is tagged and synced in + * one shot. If still dirty, it will be redirty_tail()'ed below. + * Update the dirty time to prevent enqueue and sync it again. + */ + if ((inode->i_state & I_DIRTY) && + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) + inode->dirtied_when = jiffies; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* * We didn't write back all the pages. nfs_writepages() @@ -428,7 +461,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* * slice used up: queue for next turn */ - requeue_io(inode); + requeue_io(inode, wb); } else { /* * Writeback blocked by something other than @@ -437,7 +470,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ - redirty_tail(inode); + redirty_tail(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* @@ -446,7 +479,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * submission or metadata updates after data IO * completion. */ - redirty_tail(inode); + redirty_tail(inode, wb); } else { /* * The inode is clean. At this point we either have @@ -457,9 +490,41 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } } inode_sync_complete(inode); + trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } +static long writeback_chunk_size(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + long pages; + + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * writeback_sb_inodes() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) + pages = LONG_MAX; + else { + pages = min(bdi->avg_write_bandwidth / 2, + global_dirty_limit / DIRTY_SCOPE); + pages = min(pages, work->nr_pages); + pages = round_down(pages + MIN_WRITEBACK_PAGES, + MIN_WRITEBACK_PAGES); + } + + return pages; +} + /* * Write a portion of b_io inodes which belong to @sb. * @@ -467,24 +532,36 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * inodes. Otherwise write only ones which go sequentially * in reverse order. * - * Return 1, if the caller writeback routine should be - * interrupted. Otherwise return 0. + * Return the number of pages and/or inodes written. */ -static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, - struct writeback_control *wbc, bool only_this_sb) +static long writeback_sb_inodes(struct super_block *sb, + struct bdi_writeback *wb, + struct wb_writeback_work *work) { + struct writeback_control wbc = { + .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, + .for_kupdate = work->for_kupdate, + .for_background = work->for_background, + .range_cyclic = work->range_cyclic, + .range_start = 0, + .range_end = LLONG_MAX, + }; + unsigned long start_time = jiffies; + long write_chunk; + long wrote = 0; /* count both pages and inodes */ + while (!list_empty(&wb->b_io)) { - long pages_skipped; struct inode *inode = wb_inode(wb->b_io.prev); if (inode->i_sb != sb) { - if (only_this_sb) { + if (work->sb) { /* * We only want to write back data for this * superblock, move all inodes not belonging * to it back onto the dirty list. */ - redirty_tail(inode); + redirty_tail(inode, wb); continue; } @@ -493,7 +570,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * Bounce back to the caller to unpin this and * pin the next superblock. */ - return 0; + break; } /* @@ -504,104 +581,124 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); - requeue_io(inode); + redirty_tail(inode, wb); continue; } - - /* - * Was this inode dirtied after sync_sb_inodes was called? - * This keeps sync from extra jobs and livelock. - */ - if (inode_dirtied_after(inode, wbc->wb_start)) { - spin_unlock(&inode->i_lock); - return 1; - } - __iget(inode); + write_chunk = writeback_chunk_size(wb->bdi, work); + wbc.nr_to_write = write_chunk; + wbc.pages_skipped = 0; - pages_skipped = wbc->pages_skipped; - writeback_single_inode(inode, wbc); - if (wbc->pages_skipped != pages_skipped) { + writeback_single_inode(inode, wb, &wbc); + + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; + if (!(inode->i_state & I_DIRTY)) + wrote++; + if (wbc.pages_skipped) { /* * writeback is not making progress due to locked * buffers. Skip this inode for now. */ - redirty_tail(inode); + redirty_tail(inode, wb); } spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); iput(inode); cond_resched(); - spin_lock(&inode_wb_list_lock); - if (wbc->nr_to_write <= 0) { - wbc->more_io = 1; - return 1; + spin_lock(&wb->list_lock); + /* + * bail out to wb_writeback() often enough to check + * background threshold and other termination conditions. + */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; } - if (!list_empty(&wb->b_more_io)) - wbc->more_io = 1; } - /* b_io is empty */ - return 1; + return wrote; } -void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +static long __writeback_inodes_wb(struct bdi_writeback *wb, + struct wb_writeback_work *work) { - int ret = 0; - - if (!wbc->wb_start) - wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&inode_wb_list_lock); - if (!wbc->for_kupdate || list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); + unsigned long start_time = jiffies; + long wrote = 0; while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; if (!grab_super_passive(sb)) { - requeue_io(inode); + /* + * grab_super_passive() may fail consistently due to + * s_umount being grabbed by someone else. Don't use + * requeue_io() to avoid busy retrying the inode/sb. + */ + redirty_tail(inode, wb); continue; } - ret = writeback_sb_inodes(sb, wb, wbc, false); + wrote += writeback_sb_inodes(sb, wb, work); drop_super(sb); - if (ret) - break; + /* refer to the same tests at the end of writeback_sb_inodes */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; + } } - spin_unlock(&inode_wb_list_lock); /* Leave any unwritten inodes on b_io */ + return wrote; } -static void __writeback_inodes_sb(struct super_block *sb, - struct bdi_writeback *wb, struct writeback_control *wbc) +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason) { - WARN_ON(!rwsem_is_locked(&sb->s_umount)); + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + .reason = reason, + }; - spin_lock(&inode_wb_list_lock); - if (!wbc->for_kupdate || list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); - writeback_sb_inodes(sb, wb, wbc, true); - spin_unlock(&inode_wb_list_lock); -} + spin_lock(&wb->list_lock); + if (list_empty(&wb->b_io)) + queue_io(wb, &work); + __writeback_inodes_wb(wb, &work); + spin_unlock(&wb->list_lock); -/* - * The maximum number of pages to writeout in a single bdi flush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 + return nr_pages - work.nr_pages; +} -static inline bool over_bground_thresh(void) +static bool over_bground_thresh(struct backing_dev_info *bdi) { unsigned long background_thresh, dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); - return (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) > background_thresh); + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) > background_thresh) + return true; + + if (bdi_stat(bdi, BDI_RECLAIMABLE) > + bdi_dirty_limit(bdi, background_thresh)) + return true; + + return false; +} + +/* + * Called under wb->list_lock. If there are multiple wb per bdi, + * only the flusher working on the first wb should do it. + */ +static void wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long start_time) +{ + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time); } /* @@ -622,47 +719,16 @@ static inline bool over_bground_thresh(void) static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { - struct writeback_control wbc = { - .sync_mode = work->sync_mode, - .older_than_this = NULL, - .for_kupdate = work->for_kupdate, - .for_background = work->for_background, - .range_cyclic = work->range_cyclic, - }; + unsigned long wb_start = jiffies; + long nr_pages = work->nr_pages; unsigned long oldest_jif; - long wrote = 0; - long write_chunk; struct inode *inode; + long progress; - if (wbc.for_kupdate) { - wbc.older_than_this = &oldest_jif; - oldest_jif = jiffies - - msecs_to_jiffies(dirty_expire_interval * 10); - } - if (!wbc.range_cyclic) { - wbc.range_start = 0; - wbc.range_end = LLONG_MAX; - } - - /* - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX - * here avoids calling into writeback_inodes_wb() more than once. - * - * The intended call sequence for WB_SYNC_ALL writeback is: - * - * wb_writeback() - * __writeback_inodes_sb() <== called only once - * write_cache_pages() <== called once for each inode - * (quickly) tag currently dirty pages - * (maybe slowly) sync all tagged pages - */ - if (wbc.sync_mode == WB_SYNC_NONE) - write_chunk = MAX_WRITEBACK_PAGES; - else - write_chunk = LONG_MAX; + oldest_jif = jiffies; + work->older_than_this = &oldest_jif; - wbc.wb_start = jiffies; /* livelock avoidance */ + spin_lock(&wb->list_lock); for (;;) { /* * Stop writeback when nr_pages has been consumed @@ -684,55 +750,57 @@ static long wb_writeback(struct bdi_writeback *wb, * For background writeout, stop when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh()) + if (work->for_background && !over_bground_thresh(wb->bdi)) break; - wbc.more_io = 0; - wbc.nr_to_write = write_chunk; - wbc.pages_skipped = 0; + if (work->for_kupdate) { + oldest_jif = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + work->older_than_this = &oldest_jif; + } - trace_wbc_writeback_start(&wbc, wb->bdi); + trace_writeback_start(wb->bdi, work); + if (list_empty(&wb->b_io)) + queue_io(wb, work); if (work->sb) - __writeback_inodes_sb(work->sb, wb, &wbc); + progress = writeback_sb_inodes(work->sb, wb, work); else - writeback_inodes_wb(wb, &wbc); - trace_wbc_writeback_written(&wbc, wb->bdi); + progress = __writeback_inodes_wb(wb, work); + trace_writeback_written(wb->bdi, work); - work->nr_pages -= write_chunk - wbc.nr_to_write; - wrote += write_chunk - wbc.nr_to_write; + wb_update_bandwidth(wb, wb_start); /* - * If we consumed everything, see if we have more + * Did we write something? Try for more + * + * Dirty inodes are moved to b_io for writeback in batches. + * The completion of the current batch does not necessarily + * mean the overall work is done. So we keep looping as long + * as made some progress on cleaning pages or inodes. */ - if (wbc.nr_to_write <= 0) + if (progress) continue; /* - * Didn't write everything and we don't have more IO, bail + * No more inodes for IO, bail */ - if (!wbc.more_io) + if (list_empty(&wb->b_more_io)) break; /* - * Did we write something? Try for more - */ - if (wbc.nr_to_write < write_chunk) - continue; - /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&inode_wb_list_lock); if (!list_empty(&wb->b_more_io)) { + trace_writeback_wait(wb->bdi, work); inode = wb_inode(wb->b_more_io.prev); - trace_wbc_writeback_wait(&wbc, wb->bdi); spin_lock(&inode->i_lock); - inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode, wb); spin_unlock(&inode->i_lock); } - spin_unlock(&inode_wb_list_lock); } + spin_unlock(&wb->list_lock); - return wrote; + return nr_pages - work->nr_pages; } /* @@ -766,13 +834,14 @@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh()) { + if (over_bground_thresh(wb->bdi)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, .sync_mode = WB_SYNC_NONE, .for_background = 1, .range_cyclic = 1, + .reason = WB_REASON_BACKGROUND, }; return wb_writeback(wb, &work); @@ -806,6 +875,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, + .reason = WB_REASON_PERIODIC, }; return wb_writeback(wb, &work); @@ -924,7 +994,7 @@ int bdi_writeback_thread(void *data) * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. */ -void wakeup_flusher_threads(long nr_pages) +void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) { struct backing_dev_info *bdi; @@ -937,7 +1007,7 @@ void wakeup_flusher_threads(long nr_pages) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false); + __bdi_start_writeback(bdi, nr_pages, false, reason); } rcu_read_unlock(); } @@ -1063,10 +1133,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) } spin_unlock(&inode->i_lock); - spin_lock(&inode_wb_list_lock); + spin_lock(&bdi->wb.list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); if (wakeup_bdi) bdi_wakeup_thread_delayed(bdi); @@ -1158,14 +1228,18 @@ static void wait_sb_inodes(struct super_block *sb) * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ -void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) +void writeback_inodes_sb_nr(struct super_block *sb, + unsigned long nr, + enum wb_reason reason) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .done = &done, - .nr_pages = nr, + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .tagged_writepages = 1, + .done = &done, + .nr_pages = nr, + .reason = reason, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); @@ -1182,9 +1256,9 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr); * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ -void writeback_inodes_sb(struct super_block *sb) +void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { - return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); + return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(writeback_inodes_sb); @@ -1195,11 +1269,11 @@ EXPORT_SYMBOL(writeback_inodes_sb); * Invoke writeback_inodes_sb if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. */ -int writeback_inodes_sb_if_idle(struct super_block *sb) +int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason) { if (!writeback_in_progress(sb->s_bdi)) { down_read(&sb->s_umount); - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, reason); up_read(&sb->s_umount); return 1; } else @@ -1216,11 +1290,12 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); * Returns 1 if writeback was started, 0 if not. */ int writeback_inodes_sb_nr_if_idle(struct super_block *sb, - unsigned long nr) + unsigned long nr, + enum wb_reason reason) { if (!writeback_in_progress(sb->s_bdi)) { down_read(&sb->s_umount); - writeback_inodes_sb_nr(sb, nr); + writeback_inodes_sb_nr(sb, nr, reason); up_read(&sb->s_umount); return 1; } else @@ -1244,6 +1319,7 @@ void sync_inodes_sb(struct super_block *sb) .nr_pages = LONG_MAX, .range_cyclic = 0, .done = &done, + .reason = WB_REASON_SYNC, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); @@ -1267,6 +1343,7 @@ EXPORT_SYMBOL(sync_inodes_sb); */ int write_inode_now(struct inode *inode, int sync) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; int ret; struct writeback_control wbc = { .nr_to_write = LONG_MAX, @@ -1279,11 +1356,11 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, &wbc); + ret = writeback_single_inode(inode, wb, &wbc); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); if (sync) inode_sync_wait(inode); return ret; @@ -1303,13 +1380,14 @@ EXPORT_SYMBOL(write_inode_now); */ int sync_inode(struct inode *inode, struct writeback_control *wbc) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; int ret; - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, wbc); + ret = writeback_single_inode(inode, wb, wbc); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); return ret; } EXPORT_SYMBOL(sync_inode); diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 85542a7daf40..42593c587d48 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -231,7 +231,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, if (iop) inode->i_op = iop; inode->i_fop = fop; - inode->i_nlink = nlink; + set_nlink(inode, nlink); inode->i_private = fc; d_add(dentry, inode); return dentry; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b6cca47f7b07..3426521f3205 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -47,6 +47,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stat.h> +#include <linux/module.h> #include "fuse_i.h" diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 640fc229df10..5cb8614508c3 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -258,10 +258,14 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, forget->forget_one.nlookup = nlookup; spin_lock(&fc->lock); - fc->forget_list_tail->next = forget; - fc->forget_list_tail = forget; - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + if (fc->connected) { + fc->forget_list_tail->next = forget; + fc->forget_list_tail = forget; + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + } else { + kfree(forget); + } spin_unlock(&fc->lock); } @@ -1358,6 +1362,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, if (outarg.namelen > FUSE_NAME_MAX) goto err; + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7bb685cdd00c..594f07a81c28 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -14,6 +14,7 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/compat.h> +#include <linux/swap.h> static const struct file_operations fuse_direct_io_file_operations; @@ -245,6 +246,12 @@ void fuse_release_common(struct file *file, int opcode) req = ff->reserved_req; fuse_prepare_release(ff, file->f_flags, opcode); + if (ff->flock) { + struct fuse_release_in *inarg = &req->misc.release.in; + inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; + inarg->lock_owner = fuse_lock_owner_id(ff->fc, + (fl_owner_t) file); + } /* Hold vfsmount and dentry until release is finished */ path_get(&file->f_path); req->misc.release.path = file->f_path; @@ -755,18 +762,6 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file, return req->misc.write.out.size; } -static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - - *pagep = grab_cache_page_write_begin(mapping, index, flags); - if (!*pagep) - return -ENOMEM; - return 0; -} - void fuse_write_update_size(struct inode *inode, loff_t pos) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -779,62 +774,6 @@ void fuse_write_update_size(struct inode *inode, loff_t pos) spin_unlock(&fc->lock); } -static int fuse_buffered_write(struct file *file, struct inode *inode, - loff_t pos, unsigned count, struct page *page) -{ - int err; - size_t nres; - struct fuse_conn *fc = get_fuse_conn(inode); - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - struct fuse_req *req; - - if (is_bad_inode(inode)) - return -EIO; - - /* - * Make sure writepages on the same page are not mixed up with - * plain writes. - */ - fuse_wait_on_page_writeback(inode, page->index); - - req = fuse_get_req(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->in.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_offset = offset; - nres = fuse_send_write(req, file, pos, count, NULL); - err = req->out.h.error; - fuse_put_request(fc, req); - if (!err && !nres) - err = -EIO; - if (!err) { - pos += nres; - fuse_write_update_size(inode, pos); - if (count == PAGE_CACHE_SIZE) - SetPageUptodate(page); - } - fuse_invalidate_attr(inode); - return err ? err : nres; -} - -static int fuse_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = mapping->host; - int res = 0; - - if (copied) - res = fuse_buffered_write(file, inode, pos, copied, page); - - unlock_page(page); - page_cache_release(page); - return res; -} - static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, struct inode *inode, loff_t pos, size_t count) @@ -908,6 +847,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, pagefault_enable(); flush_dcache_page(page); + mark_page_accessed(page); + if (!tmp) { unlock_page(page); page_cache_release(page); @@ -1507,7 +1448,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; int err; - if (fl->fl_lmops && fl->fl_lmops->fl_grant) { + if (fl->fl_lmops && fl->fl_lmops->lm_grant) { /* NLM needs asynchronous locks, which we don't support yet */ return -ENOLCK; } @@ -1559,11 +1500,14 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) struct fuse_conn *fc = get_fuse_conn(inode); int err; - if (fc->no_lock) { + if (fc->no_flock) { err = flock_lock_file_wait(file, fl); } else { + struct fuse_file *ff = file->private_data; + /* emulate flock with POSIX locks */ fl->fl_owner = (fl_owner_t) file; + ff->flock = true; err = fuse_setlk(file, fl, 1); } @@ -2201,8 +2145,6 @@ static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, .writepage = fuse_writepage, .launder_page = fuse_launder_page, - .write_begin = fuse_write_begin, - .write_end = fuse_write_end, .readpages = fuse_readpages, .set_page_dirty = __set_page_dirty_nobuffers, .bmap = fuse_bmap, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c6aa2d4b8517..cf6db0a93219 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -135,6 +135,9 @@ struct fuse_file { /** Wait queue head for poll */ wait_queue_head_t poll_wait; + + /** Has flock been performed on this file? */ + bool flock:1; }; /** One input argument of a request */ @@ -448,7 +451,7 @@ struct fuse_conn { /** Is removexattr not implemented by fs? */ unsigned no_removexattr:1; - /** Are file locking primitives not implemented by fs? */ + /** Are posix file locking primitives not implemented by fs? */ unsigned no_lock:1; /** Is access not implemented by fs? */ @@ -472,6 +475,9 @@ struct fuse_conn { /** Don't apply umask to creation modes */ unsigned dont_mask:1; + /** Are BSD file locking primitives not implemented by fs? */ + unsigned no_flock:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 38f84cd48b67..3e6d72756479 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -71,7 +71,7 @@ struct fuse_mount_data { unsigned blksize; }; -struct fuse_forget_link *fuse_alloc_forget() +struct fuse_forget_link *fuse_alloc_forget(void) { return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); } @@ -151,7 +151,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_ino = attr->ino; inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); - inode->i_nlink = attr->nlink; + set_nlink(inode, attr->nlink); inode->i_uid = attr->uid; inode->i_gid = attr->gid; inode->i_blocks = attr->blocks; @@ -809,6 +809,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->async_read = 1; if (!(arg->flags & FUSE_POSIX_LOCKS)) fc->no_lock = 1; + if (arg->minor >= 17) { + if (!(arg->flags & FUSE_FLOCK_LOCKS)) + fc->no_flock = 1; + } else { + if (!(arg->flags & FUSE_POSIX_LOCKS)) + fc->no_flock = 1; + } if (arg->flags & FUSE_ATOMIC_O_TRUNC) fc->atomic_o_trunc = 1; if (arg->minor >= 9) { @@ -823,6 +830,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; + fc->no_flock = 1; } fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); @@ -843,7 +851,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->minor = FUSE_KERNEL_MINOR_VERSION; arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | - FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK; + FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | + FUSE_FLOCK_LOCKS; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 70e90b4974ce..d0dddaceac59 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -82,18 +82,14 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value, return PTR_ERR(acl); } if (acl) { - mode_t mode; - error = posix_acl_valid(acl); if (error) goto failed; switch (type) { case ACL_TYPE_ACCESS: - mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) goto failed; - inode->i_mode = mode; inode->i_ctime = CURRENT_TIME; if (error == 0) { posix_acl_release(acl); @@ -125,38 +121,23 @@ int generic_acl_init(struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; - mode_t mode = inode->i_mode; int error; - inode->i_mode = mode & ~current_umask(); if (!S_ISLNK(inode->i_mode)) acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); if (acl) { - struct posix_acl *clone; - - if (S_ISDIR(inode->i_mode)) { - clone = posix_acl_clone(acl, GFP_KERNEL); - error = -ENOMEM; - if (!clone) - goto cleanup; - set_cached_acl(inode, ACL_TYPE_DEFAULT, clone); - posix_acl_release(clone); - } - clone = posix_acl_clone(acl, GFP_KERNEL); - error = -ENOMEM; - if (!clone) - goto cleanup; - error = posix_acl_create_masq(clone, &mode); - if (error >= 0) { - inode->i_mode = mode; - if (error > 0) - set_cached_acl(inode, ACL_TYPE_ACCESS, clone); - } - posix_acl_release(clone); + if (S_ISDIR(inode->i_mode)) + set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); + error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); + if (error < 0) + return error; + if (error > 0) + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + } else { + inode->i_mode &= ~current_umask(); } error = 0; -cleanup: posix_acl_release(acl); return error; } @@ -170,44 +151,22 @@ cleanup: int generic_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; int error = 0; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; acl = get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { - clone = posix_acl_clone(acl, GFP_KERNEL); + error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (error) + return error; + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) - set_cached_acl(inode, ACL_TYPE_ACCESS, clone); - posix_acl_release(clone); } return error; } -int -generic_check_acl(struct inode *inode, int mask) -{ - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - return -ECHILD; - } else { - struct posix_acl *acl; - - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - } - return -EAGAIN; -} - const struct xattr_handler generic_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 8ef1079f1665..65978d7885c8 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -67,39 +67,12 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) return acl; } -/** - * gfs2_check_acl - Check an ACL to see if we're allowed to do something - * @inode: the file we want to do something to - * @mask: what we want to do - * - * Returns: errno - */ - -int gfs2_check_acl(struct inode *inode, int mask) +struct posix_acl *gfs2_get_acl(struct inode *inode, int type) { - struct posix_acl *acl; - int error; - - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - return -ECHILD; - return -EAGAIN; - } - - acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; + return gfs2_acl_get(GFS2_I(inode), type); } -static int gfs2_set_mode(struct inode *inode, mode_t mode) +static int gfs2_set_mode(struct inode *inode, umode_t mode) { int error = 0; @@ -109,7 +82,7 @@ static int gfs2_set_mode(struct inode *inode, mode_t mode) iattr.ia_valid = ATTR_MODE; iattr.ia_mode = mode; - error = gfs2_setattr_simple(GFS2_I(inode), &iattr); + error = gfs2_setattr_simple(inode, &iattr); } return error; @@ -143,8 +116,8 @@ out: int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct posix_acl *acl, *clone; - mode_t mode = inode->i_mode; + struct posix_acl *acl; + umode_t mode = inode->i_mode; int error = 0; if (!sdp->sd_args.ar_posix_acl) @@ -168,16 +141,10 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) goto out; } - clone = posix_acl_clone(acl, GFP_NOFS); - error = -ENOMEM; - if (!clone) - goto out; - posix_acl_release(acl); - acl = clone; - - error = posix_acl_create_masq(acl, &mode); + error = posix_acl_create(&acl, GFP_NOFS, &mode); if (error < 0) - goto out; + return error; + if (error == 0) goto munge; @@ -193,7 +160,8 @@ out: int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) { - struct posix_acl *acl, *clone; + struct inode *inode = &ip->i_inode; + struct posix_acl *acl; char *data; unsigned int len; int error; @@ -202,27 +170,21 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) if (IS_ERR(acl)) return PTR_ERR(acl); if (!acl) - return gfs2_setattr_simple(ip, attr); + return gfs2_setattr_simple(inode, attr); - clone = posix_acl_clone(acl, GFP_NOFS); + error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); + if (error) + return error; + + len = posix_acl_to_xattr(acl, NULL, 0); + data = kmalloc(len, GFP_NOFS); error = -ENOMEM; - if (!clone) + if (data == NULL) goto out; - posix_acl_release(acl); - acl = clone; - - error = posix_acl_chmod_masq(acl, attr->ia_mode); - if (!error) { - len = posix_acl_to_xattr(acl, NULL, 0); - data = kmalloc(len, GFP_NOFS); - error = -ENOMEM; - if (data == NULL) - goto out; - posix_acl_to_xattr(acl, data, len); - error = gfs2_xattr_acl_chmod(ip, attr, data); - kfree(data); - set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); - } + posix_acl_to_xattr(acl, data, len); + error = gfs2_xattr_acl_chmod(ip, attr, data); + kfree(data); + set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); out: posix_acl_release(acl); @@ -315,7 +277,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, goto out_release; if (type == ACL_TYPE_ACCESS) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error <= 0) { diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index b522b0cb39ea..0da38dc7efec 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -16,7 +16,7 @@ #define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" #define GFS2_ACL_MAX_ENTRIES 25 -extern int gfs2_check_acl(struct inode *inode, int mask); +extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); extern const struct xattr_handler gfs2_xattr_system_handler; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index f9fbbe96c222..4858e1fed8b1 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -663,7 +663,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (&ip->i_inode == sdp->sd_rindex) rblocks += 2 * RES_STATFS; if (alloc_required) - rblocks += gfs2_rg_blocks(al); + rblocks += gfs2_rg_blocks(ip); error = gfs2_trans_begin(sdp, rblocks, PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); @@ -787,7 +787,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, u64 to = pos + copied; void *kaddr; unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); - struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); kaddr = kmap_atomic(page, KM_USER0); @@ -804,7 +803,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, if (copied) { if (inode->i_size < to) i_size_write(inode, to); - gfs2_dinode_out(ip, di); mark_inode_dirty(inode); } @@ -873,10 +871,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, gfs2_page_add_databufs(ip, page, from, to); ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (ret > 0) { - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(inode); - } if (inode == sdp->sd_rindex) { adjust_fs_space(inode); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 7878c473ae62..41d494d79709 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -10,6 +10,7 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> +#include <linux/blkdev.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> @@ -36,11 +37,6 @@ struct metapath { __u16 mp_list[GFS2_MAX_META_HEIGHT]; }; -typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh, - struct buffer_head *bh, __be64 *top, - __be64 *bottom, unsigned int height, - void *data); - struct strip_mine { int sm_first; unsigned int sm_height; @@ -273,6 +269,30 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; } +static void gfs2_metapath_ra(struct gfs2_glock *gl, + const struct buffer_head *bh, const __be64 *pos) +{ + struct buffer_head *rabh; + const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size); + const __be64 *t; + + for (t = pos; t < endp; t++) { + if (!*t) + continue; + + rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE); + if (trylock_buffer(rabh)) { + if (!buffer_uptodate(rabh)) { + rabh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, rabh); + continue; + } + unlock_buffer(rabh); + } + brelse(rabh); + } +} + /** * lookup_metapath - Walk the metadata tree to a specific point * @ip: The inode @@ -432,12 +452,14 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct super_block *sb = sdp->sd_vfs; struct buffer_head *dibh = mp->mp_bh[0]; u64 bn, dblock = 0; unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; unsigned dblks = 0; unsigned ptrs_per_blk; const unsigned end_of_metadata = height - 1; + int ret; int eob = 0; enum alloc_state state; __be64 *ptr; @@ -540,6 +562,15 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, dblock = bn; while (n-- > 0) *ptr++ = cpu_to_be64(bn++); + if (buffer_zeronew(bh_map)) { + ret = sb_issue_zeroout(sb, dblock, dblks, + GFP_NOFS); + if (ret) { + fs_err(sdp, + "Failed to zero data buffers\n"); + clear_buffer_zeronew(bh_map); + } + } break; } } while ((state != ALLOC_DATA) || !dblock); @@ -668,76 +699,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi } /** - * recursive_scan - recursively scan through the end of a file - * @ip: the inode - * @dibh: the dinode buffer - * @mp: the path through the metadata to the point to start - * @height: the height the recursion is at - * @block: the indirect block to look at - * @first: 1 if this is the first block - * @bc: the call to make for each piece of metadata - * @data: data opaque to this function to pass to @bc - * - * When this is first called @height and @block should be zero and - * @first should be 1. - * - * Returns: errno - */ - -static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, - struct metapath *mp, unsigned int height, - u64 block, int first, block_call_t bc, - void *data) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct buffer_head *bh = NULL; - __be64 *top, *bottom; - u64 bn; - int error; - int mh_size = sizeof(struct gfs2_meta_header); - - if (!height) { - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - return error; - dibh = bh; - - top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; - bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; - } else { - error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); - if (error) - return error; - - top = (__be64 *)(bh->b_data + mh_size) + - (first ? mp->mp_list[height] : 0); - - bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; - } - - error = bc(ip, dibh, bh, top, bottom, height, data); - if (error) - goto out; - - if (height < ip->i_height - 1) - for (; top < bottom; top++, first = 0) { - if (!*top) - continue; - - bn = be64_to_cpu(*top); - - error = recursive_scan(ip, dibh, mp, height + 1, bn, - first, bc, data); - if (error) - break; - } - -out: - brelse(bh); - return error; -} - -/** * do_strip - Look for a layer a particular layer of the file and strip it off * @ip: the inode * @dibh: the dinode buffer @@ -752,9 +713,8 @@ out: static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, struct buffer_head *bh, __be64 *top, __be64 *bottom, - unsigned int height, void *data) + unsigned int height, struct strip_mine *sm) { - struct strip_mine *sm = data; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrp_list rlist; u64 bn, bstart; @@ -783,11 +743,6 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, else if (ip->i_depth) revokes = sdp->sd_inptrs; - if (ip != GFS2_I(sdp->sd_rindex)) - error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); - else if (!sdp->sd_rgrps) - error = gfs2_ri_update(ip); - if (error) return error; @@ -805,7 +760,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, blen++; else { if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); bstart = bn; blen = 1; @@ -813,7 +768,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, } if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); else goto out; /* Nothing to do */ @@ -887,12 +842,82 @@ out_rg_gunlock: out_rlist: gfs2_rlist_free(&rlist); out: - if (ip != GFS2_I(sdp->sd_rindex)) - gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); return error; } /** + * recursive_scan - recursively scan through the end of a file + * @ip: the inode + * @dibh: the dinode buffer + * @mp: the path through the metadata to the point to start + * @height: the height the recursion is at + * @block: the indirect block to look at + * @first: 1 if this is the first block + * @sm: data opaque to this function to pass to @bc + * + * When this is first called @height and @block should be zero and + * @first should be 1. + * + * Returns: errno + */ + +static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, + struct metapath *mp, unsigned int height, + u64 block, int first, struct strip_mine *sm) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh = NULL; + __be64 *top, *bottom; + u64 bn; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (!height) { + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + dibh = bh; + + top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; + bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; + } else { + error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); + if (error) + return error; + + top = (__be64 *)(bh->b_data + mh_size) + + (first ? mp->mp_list[height] : 0); + + bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; + } + + error = do_strip(ip, dibh, bh, top, bottom, height, sm); + if (error) + goto out; + + if (height < ip->i_height - 1) { + + gfs2_metapath_ra(ip->i_gl, bh, top); + + for (; top < bottom; top++, first = 0) { + if (!*top) + continue; + + bn = be64_to_cpu(*top); + + error = recursive_scan(ip, dibh, mp, height + 1, bn, + first, sm); + if (error) + break; + } + } +out: + brelse(bh); + return error; +} + + +/** * gfs2_block_truncate_page - Deal with zeroing out data for truncate * * This is partly borrowed from ext3. @@ -1031,7 +1056,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) sm.sm_first = !!size; sm.sm_height = height; - error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm); + error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm); if (error) break; } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 1cc2f8ec52a2..8ccad2467cb6 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -240,16 +240,15 @@ fail: return error; } -static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, - u64 offset, unsigned int size) +static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) { struct buffer_head *dibh; int error; error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - offset += sizeof(struct gfs2_dinode); - memcpy(buf, dibh->b_data + offset, size); + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); brelse(dibh); } @@ -261,13 +260,12 @@ static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, * gfs2_dir_read_data - Read a data from a directory inode * @ip: The GFS2 Inode * @buf: The buffer to place result into - * @offset: File offset to begin jdata_readng from * @size: Amount of data to transfer * * Returns: The amount of data actually copied or the error */ -static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, - unsigned int size, unsigned ra) +static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); u64 lblock, dblock; @@ -275,24 +273,14 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, unsigned int o; int copied = 0; int error = 0; - u64 disksize = i_size_read(&ip->i_inode); - - if (offset >= disksize) - return 0; - - if (offset + size > disksize) - size = disksize - offset; - - if (!size) - return 0; if (gfs2_is_stuffed(ip)) - return gfs2_dir_read_stuffed(ip, buf, offset, size); + return gfs2_dir_read_stuffed(ip, buf, size); if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) return -EINVAL; - lblock = offset; + lblock = 0; o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); while (copied < size) { @@ -311,8 +299,6 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, if (error || !dblock) goto fail; BUG_ON(extlen < 1); - if (!ra) - extlen = 1; bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); } else { error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); @@ -328,7 +314,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, extlen--; memcpy(buf, bh->b_data + o, amount); brelse(bh); - buf += amount; + buf += (amount/sizeof(__be64)); copied += amount; lblock++; o = sizeof(struct gfs2_meta_header); @@ -371,7 +357,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) if (hc == NULL) return ERR_PTR(-ENOMEM); - ret = gfs2_dir_read_data(ip, (char *)hc, 0, hsize, 1); + ret = gfs2_dir_read_data(ip, hc, hsize); if (ret < 0) { kfree(hc); return ERR_PTR(ret); @@ -1695,7 +1681,6 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; - int error; /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1724,22 +1709,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) } brelse(bh); - error = gfs2_meta_inode_buffer(dip, &bh); - if (error) - return error; - if (!dip->i_entries) gfs2_consist_inode(dip); - gfs2_trans_add_bh(dip->i_gl, bh, 1); dip->i_entries--; dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; if (S_ISDIR(dentry->d_inode->i_mode)) drop_nlink(&dip->i_inode); - gfs2_dinode_out(dip, bh->b_data); - brelse(bh); mark_inode_dirty(&dip->i_inode); - return error; + return 0; } /** @@ -1829,10 +1807,6 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (error) goto out_put; - error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh); - if (error) - goto out_qs; - /* Count the number of leaves */ bh = leaf_bh; @@ -1847,7 +1821,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (blk != leaf_no) brelse(bh); - gfs2_rlist_add(sdp, &rlist, blk); + gfs2_rlist_add(dip, &rlist, blk); l_blocks++; } @@ -1911,8 +1885,6 @@ out_rg_gunlock: gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); out_rlist: gfs2_rlist_free(&rlist); - gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh); -out_qs: gfs2_quota_unhold(dip); out_put: gfs2_alloc_put(dip); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index edeb9e802903..ce36a56dfeac 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -59,15 +59,24 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) struct gfs2_holder i_gh; loff_t error; - if (origin == 2) { + switch (origin) { + case SEEK_END: /* These reference inode->i_size */ + case SEEK_DATA: + case SEEK_HOLE: error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { - error = generic_file_llseek_unlocked(file, offset, origin); + error = generic_file_llseek(file, offset, origin); gfs2_glock_dq_uninit(&i_gh); } - } else - error = generic_file_llseek_unlocked(file, offset, origin); + break; + case SEEK_CUR: + case SEEK_SET: + error = generic_file_llseek(file, offset, origin); + break; + default: + error = -EINVAL; + } return error; } @@ -357,8 +366,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned int data_blocks, ind_blocks, rblocks; struct gfs2_holder gh; struct gfs2_alloc *al; + loff_t size; int ret; + /* Wait if fs is frozen. This is racy so we check again later on + * and retry if the fs has been frozen after the page lock has + * been acquired + */ + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) @@ -367,8 +383,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); - if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) + if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) { + lock_page(page); + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(page); + } goto out_unlock; + } + ret = -ENOMEM; al = gfs2_alloc_get(ip); if (al == NULL) @@ -388,7 +411,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) rblocks += data_blocks ? data_blocks : 1; if (ind_blocks || data_blocks) { rblocks += RES_STATFS + RES_QUOTA; - rblocks += gfs2_rg_blocks(al); + rblocks += gfs2_rg_blocks(ip); } ret = gfs2_trans_begin(sdp, rblocks, 0); if (ret) @@ -396,21 +419,29 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) lock_page(page); ret = -EINVAL; - last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; - if (page->index > last_index) - goto out_unlock_page; + size = i_size_read(inode); + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + /* Check page index against inode size */ + if (size == 0 || (page->index > last_index)) + goto out_trans_end; + + ret = -EAGAIN; + /* If truncated, we must retry the operation, we may have raced + * with the glock demotion code. + */ + if (!PageUptodate(page) || page->mapping != inode->i_mapping) + goto out_trans_end; + + /* Unstuff, if required, and allocate backing blocks for page */ ret = 0; - if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) - goto out_unlock_page; - if (gfs2_is_stuffed(ip)) { + if (gfs2_is_stuffed(ip)) ret = gfs2_unstuff_dinode(ip, page); - if (ret) - goto out_unlock_page; - } - ret = gfs2_allocate_page_backing(page); + if (ret == 0) + ret = gfs2_allocate_page_backing(page); -out_unlock_page: - unlock_page(page); +out_trans_end: + if (ret) + unlock_page(page); gfs2_trans_end(sdp); out_trans_fail: gfs2_inplace_release(ip); @@ -422,11 +453,17 @@ out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else if (ret) - ret = VM_FAULT_SIGBUS; - return ret; + if (ret == 0) { + set_page_dirty(page); + /* This check must be post dropping of transaction lock */ + if (inode->i_sb->s_frozen == SB_UNFROZEN) { + wait_on_page_writeback(page); + } else { + ret = -EAGAIN; + unlock_page(page); + } + } + return block_page_mkwrite_return(ret); } static const struct vm_operations_struct gfs2_vm_ops = { @@ -551,8 +588,16 @@ static int gfs2_close(struct inode *inode, struct file *file) * @end: the end position in the file to sync * @datasync: set if we can ignore timestamp changes * - * The VFS will flush data for us. We only need to worry - * about metadata here. + * We split the data flushing here so that we don't wait for the data + * until after we've also sent the metadata to disk. Note that for + * data=ordered, we will write & wait for the data at the log flush + * stage anyway, so this is unlikely to make much of a difference + * except in the data=writeback case. + * + * If the fdatawrite fails due to any reason except -EIO, we will + * continue the remainder of the fsync, although we'll still report + * the error at the end. This is to match filemap_write_and_wait_range() + * behaviour. * * Returns: errno */ @@ -560,30 +605,34 @@ static int gfs2_close(struct inode *inode, struct file *file) static int gfs2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); struct gfs2_inode *ip = GFS2_I(inode); - int ret; + int ret, ret1 = 0; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); + if (mapping->nrpages) { + ret1 = filemap_fdatawrite_range(mapping, start, end); + if (ret1 == -EIO) + return ret1; + } if (datasync) sync_state &= ~I_DIRTY_SYNC; if (sync_state) { ret = sync_inode_metadata(inode, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); + if (ret) return ret; - } - gfs2_ail_flush(ip->i_gl); + if (gfs2_is_jdata(ip)) + filemap_write_and_wait(mapping); + gfs2_ail_flush(ip->i_gl, 1); } - mutex_unlock(&inode->i_mutex); - return 0; + if (mapping->nrpages) + ret = filemap_fdatawait_range(mapping, start, end); + + return ret ? ret : ret1; } /** @@ -620,135 +669,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return generic_file_aio_write(iocb, iov, nr_segs, pos); } -static int empty_write_end(struct page *page, unsigned from, - unsigned to, int mode) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *bh; - unsigned offset, blksize = 1 << inode->i_blkbits; - pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; - - zero_user(page, from, to-from); - mark_page_accessed(page); - - if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) { - if (!gfs2_is_writeback(ip)) - gfs2_page_add_databufs(ip, page, from, to); - - block_commit_write(page, from, to); - return 0; - } - - offset = 0; - bh = page_buffers(page); - while (offset < to) { - if (offset >= from) { - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - clear_buffer_new(bh); - write_dirty_buffer(bh, WRITE); - } - offset += blksize; - bh = bh->b_this_page; - } - - offset = 0; - bh = page_buffers(page); - while (offset < to) { - if (offset >= from) { - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - return -EIO; - } - offset += blksize; - bh = bh->b_this_page; - } - return 0; -} - -static int needs_empty_write(sector_t block, struct inode *inode) -{ - int error; - struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; - - bh_map.b_size = 1 << inode->i_blkbits; - error = gfs2_block_map(inode, block, &bh_map, 0); - if (unlikely(error)) - return error; - return !buffer_mapped(&bh_map); -} - -static int write_empty_blocks(struct page *page, unsigned from, unsigned to, - int mode) -{ - struct inode *inode = page->mapping->host; - unsigned start, end, next, blksize; - sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - int ret; - - blksize = 1 << inode->i_blkbits; - next = end = 0; - while (next < from) { - next += blksize; - block++; - } - start = next; - do { - next += blksize; - ret = needs_empty_write(block, inode); - if (unlikely(ret < 0)) - return ret; - if (ret == 0) { - if (end) { - ret = __block_write_begin(page, start, end - start, - gfs2_block_map); - if (unlikely(ret)) - return ret; - ret = empty_write_end(page, start, end, mode); - if (unlikely(ret)) - return ret; - end = 0; - } - start = next; - } - else - end = next; - block++; - } while (next < to); - - if (end) { - ret = __block_write_begin(page, start, end - start, gfs2_block_map); - if (unlikely(ret)) - return ret; - ret = empty_write_end(page, start, end, mode); - if (unlikely(ret)) - return ret; - } - - return 0; -} - static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, int mode) { struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *dibh; int error; - u64 start = offset >> PAGE_CACHE_SHIFT; - unsigned int start_offset = offset & ~PAGE_CACHE_MASK; - u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; - pgoff_t curr; - struct page *page; - unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; - unsigned int from, to; - - if (!end_offset) - end_offset = PAGE_CACHE_SIZE; + unsigned int nr_blks; + sector_t lblock = offset >> inode->i_blkbits; error = gfs2_meta_inode_buffer(ip, &dibh); if (unlikely(error)) - goto out; + return error; gfs2_trans_add_bh(ip->i_gl, dibh, 1); @@ -758,40 +690,31 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, goto out; } - curr = start; - offset = start << PAGE_CACHE_SHIFT; - from = start_offset; - to = PAGE_CACHE_SIZE; - while (curr <= end) { - page = grab_cache_page_write_begin(inode->i_mapping, curr, - AOP_FLAG_NOFS); - if (unlikely(!page)) { - error = -ENOMEM; - goto out; - } + while (len) { + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + bh_map.b_size = len; + set_buffer_zeronew(&bh_map); - if (curr == end) - to = end_offset; - error = write_empty_blocks(page, from, to, mode); - if (!error && offset + to > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - i_size_write(inode, offset + to); - } - unlock_page(page); - page_cache_release(page); - if (error) + error = gfs2_block_map(inode, lblock, &bh_map, 1); + if (unlikely(error)) goto out; - curr++; - offset += PAGE_CACHE_SIZE; - from = 0; + len -= bh_map.b_size; + nr_blks = bh_map.b_size >> inode->i_blkbits; + lblock += nr_blks; + if (!buffer_new(&bh_map)) + continue; + if (unlikely(!buffer_zeronew(&bh_map))) { + error = -EIO; + goto out; + } } + if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) + i_size_write(inode, offset + len); - gfs2_dinode_out(ip, dibh->b_data); mark_inode_dirty(inode); - brelse(dibh); - out: + brelse(dibh); return error; } @@ -799,7 +722,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, unsigned int *data_blocks, unsigned int *ind_blocks) { const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int max_blocks = ip->i_rgd->rd_free_clone; unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); for (tmp = max_data; tmp > sdp->sd_diptrs;) { @@ -831,6 +754,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, int error; loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + loff_t max_chunk_size = UINT_MAX & bsize_mask; next = (next + 1) << sdp->sd_sb.sb_bsize_shift; /* We only support the FALLOC_FL_KEEP_SIZE mode */ @@ -884,11 +808,12 @@ retry: goto out_qunlock; } max_bytes = bytes; - calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); + calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len, + &max_bytes, &data_blocks, &ind_blocks); al->al_requested = data_blocks + ind_blocks; rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + - RES_RG_HDR + gfs2_rg_blocks(al); + RES_RG_HDR + gfs2_rg_blocks(ip); if (gfs2_is_jdata(ip)) rblocks += data_blocks ? data_blocks : 1; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 66707118af25..2553b858a72e 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -201,7 +201,7 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); -__attribute__ ((format(printf, 2, 3))) +__printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index da21ecaafcc2..1656df7aacd2 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,40 +28,55 @@ #include "trans.h" #include "dir.h" +static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) +{ + fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n", + bh, (unsigned long long)bh->b_blocknr, bh->b_state, + bh->b_page->mapping, bh->b_page->flags); + fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n", + gl->gl_name.ln_type, gl->gl_name.ln_number, + gfs2_glock2aspace(gl)); + gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n"); +} + /** * __gfs2_ail_flush - remove all buffers for a given lock from the AIL * @gl: the glock + * @fsync: set when called from fsync (not all buffers will be clean) * * None of the buffers should be dirty, locked, or pinned. */ -static void __gfs2_ail_flush(struct gfs2_glock *gl) +static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_sbd; struct list_head *head = &gl->gl_ail_list; - struct gfs2_bufdata *bd; + struct gfs2_bufdata *bd, *tmp; struct buffer_head *bh; + const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); + sector_t blocknr; + gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); - while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, - bd_ail_gl_list); + list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) { bh = bd->bd_bh; - gfs2_remove_from_ail(bd); - bd->bd_bh = NULL; + if (bh->b_state & b_state) { + if (fsync) + continue; + gfs2_ail_error(gl, bh); + } + blocknr = bh->b_blocknr; bh->b_private = NULL; - spin_unlock(&sdp->sd_ail_lock); + gfs2_remove_from_ail(bd); /* drops ref on bh */ - bd->bd_blkno = bh->b_blocknr; - gfs2_log_lock(sdp); - gfs2_assert_withdraw(sdp, !buffer_busy(bh)); - gfs2_trans_add_revoke(sdp, bd); - gfs2_log_unlock(sdp); + bd->bd_bh = NULL; + bd->bd_blkno = blocknr; - spin_lock(&sdp->sd_ail_lock); + gfs2_trans_add_revoke(sdp, bd); } - gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + BUG_ON(!fsync && atomic_read(&gl->gl_ail_count)); spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); } @@ -84,13 +99,13 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) BUG_ON(current->journal_info); current->journal_info = &tr; - __gfs2_ail_flush(gl); + __gfs2_ail_flush(gl, 0); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } -void gfs2_ail_flush(struct gfs2_glock *gl) +void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_sbd; unsigned int revokes = atomic_read(&gl->gl_ail_count); @@ -102,7 +117,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl) ret = gfs2_trans_begin(sdp, 0, revokes); if (ret) return; - __gfs2_ail_flush(gl); + __gfs2_ail_flush(gl, fsync); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } @@ -119,6 +134,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl) static void rgrp_go_sync(struct gfs2_glock *gl) { struct address_space *metamapping = gfs2_glock2aspace(gl); + struct gfs2_rgrpd *rgd; int error; if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) @@ -130,6 +146,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl) error = filemap_fdatawait(metamapping); mapping_set_error(metamapping, error); gfs2_ail_empty_gl(gl); + + spin_lock(&gl->gl_spin); + rgd = gl->gl_object; + if (rgd) + gfs2_free_clones(rgd); + spin_unlock(&gl->gl_spin); } /** @@ -277,7 +299,7 @@ static void gfs2_set_nlink(struct inode *inode, u32 nlink) if (nlink == 0) clear_nlink(inode); else - inode->i_nlink = nlink; + set_nlink(inode, nlink); } } @@ -430,33 +452,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) } /** - * rgrp_go_lock - operation done after an rgrp lock is locked by - * a first holder on this node. - * @gl: the glock - * @flags: - * - * Returns: errno - */ - -static int rgrp_go_lock(struct gfs2_holder *gh) -{ - return gfs2_rgrp_bh_get(gh->gh_gl->gl_object); -} - -/** - * rgrp_go_unlock - operation done before an rgrp lock is unlocked by - * a last holder on this node. - * @gl: the glock - * @flags: - * - */ - -static void rgrp_go_unlock(struct gfs2_holder *gh) -{ - gfs2_rgrp_bh_put(gh->gh_gl->gl_object); -} - -/** * trans_go_sync - promote/demote the transaction glock * @gl: the glock * @state: the requested state @@ -558,8 +553,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = { const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_xmote_th = rgrp_go_sync, .go_inval = rgrp_go_inval, - .go_lock = rgrp_go_lock, - .go_unlock = rgrp_go_unlock, + .go_lock = gfs2_rgrp_go_lock, + .go_unlock = gfs2_rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, .go_flags = GLOF_ASPACE, diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index 6fce409b5a50..bf95a2dc1662 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -23,6 +23,6 @@ extern const struct gfs2_glock_operations gfs2_quota_glops; extern const struct gfs2_glock_operations gfs2_journal_glops; extern const struct gfs2_glock_operations *gfs2_glops_list[]; -extern void gfs2_ail_flush(struct gfs2_glock *gl); +extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); #endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 892ac37de8ae..7389dfdcc9ef 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -18,6 +18,7 @@ #include <linux/rcupdate.h> #include <linux/rculist_bl.h> #include <linux/completion.h> +#include <linux/rbtree.h> #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -78,8 +79,7 @@ struct gfs2_bitmap { }; struct gfs2_rgrpd { - struct list_head rd_list; /* Link with superblock */ - struct list_head rd_list_mru; + struct rb_node rd_node; /* Link with superblock */ struct gfs2_glock *rd_gl; /* Glock for this rgrp */ u64 rd_addr; /* grp block disk address */ u64 rd_data0; /* first data location */ @@ -91,10 +91,7 @@ struct gfs2_rgrpd { u32 rd_dinodes; u64 rd_igeneration; struct gfs2_bitmap *rd_bits; - struct mutex rd_mutex; - struct gfs2_log_element rd_le; struct gfs2_sbd *rd_sbd; - unsigned int rd_bh_count; u32 rd_last_alloc; u32 rd_flags; #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ @@ -106,12 +103,15 @@ struct gfs2_rgrpd { enum gfs2_state_bits { BH_Pinned = BH_PrivateStart, BH_Escaped = BH_PrivateStart + 1, + BH_Zeronew = BH_PrivateStart + 2, }; BUFFER_FNS(Pinned, pinned) TAS_BUFFER_FNS(Pinned, pinned) BUFFER_FNS(Escaped, escaped) TAS_BUFFER_FNS(Escaped, escaped) +BUFFER_FNS(Zeronew, zeronew) +TAS_BUFFER_FNS(Zeronew, zeronew) struct gfs2_bufdata { struct buffer_head *bd_bh; @@ -246,7 +246,6 @@ struct gfs2_glock { struct gfs2_alloc { /* Quota stuff */ - struct gfs2_quota_data *al_qd[2*MAXQUOTAS]; struct gfs2_holder al_qd_ghs[2*MAXQUOTAS]; unsigned int al_qd_num; @@ -255,18 +254,13 @@ struct gfs2_alloc { u32 al_alloced; /* Filled in by gfs2_alloc_*() */ /* Filled in by gfs2_inplace_reserve() */ - - unsigned int al_line; - char *al_file; - struct gfs2_holder al_ri_gh; struct gfs2_holder al_rgd_gh; - struct gfs2_rgrpd *al_rgd; - }; enum { GIF_INVALID = 0, GIF_QD_LOCKED = 1, + GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, }; @@ -282,6 +276,7 @@ struct gfs2_inode { struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ struct gfs2_alloc *i_alloc; + struct gfs2_rgrpd *i_rgd; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; struct list_head i_trunc_list; @@ -574,9 +569,7 @@ struct gfs2_sbd { int sd_rindex_uptodate; spinlock_t sd_rindex_spin; struct mutex sd_rindex_mutex; - struct list_head sd_rindex_list; - struct list_head sd_rindex_mru_list; - struct gfs2_rgrpd *sd_rindex_forward; + struct rb_root sd_rindex_tree; unsigned int sd_rgrps; unsigned int sd_max_rg_data; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 0fb51a96eff0..cfd4959b218c 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -583,7 +583,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, goto fail_quota_locks; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_length + + dip->i_rgd->rd_length + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -613,8 +613,7 @@ fail_end_trans: gfs2_trans_end(sdp); fail_ipreserv: - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); + gfs2_inplace_release(dip); fail_quota_locks: gfs2_quota_unlock(dip); @@ -624,31 +623,29 @@ fail: return error; } -static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, - const struct qstr *qstr) +int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; - - err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, - &name, &value, &len); - - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = __gfs2_xattr_set(inode, xattr->name, xattr->value, + xattr->value_len, 0, + GFS2_EATYPE_SECURITY); + if (err < 0) + break; } - - err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0, - GFS2_EATYPE_SECURITY); - kfree(value); - kfree(name); - return err; } +static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, + const struct qstr *qstr) +{ + return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, + &gfs2_initxattrs, NULL); +} + /** * gfs2_create_inode - Create a new inode * @dir: The parent directory @@ -663,7 +660,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, unsigned int mode, dev_t dev, const char *symname, - unsigned int size) + unsigned int size, int excl) { const struct qstr *name = &dentry->d_name; struct gfs2_holder ghs[2]; @@ -683,6 +680,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail; error = create_ok(dip, name, mode); + if ((error == -EEXIST) && S_ISREG(mode) && !excl) { + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + gfs2_glock_dq_uninit(ghs); + d_instantiate(dentry, inode); + return IS_ERR(inode) ? PTR_ERR(inode) : 0; + } if (error) goto fail_gunlock; @@ -725,21 +728,22 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, brelse(bh); gfs2_trans_end(sdp); - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); + gfs2_inplace_release(dip); gfs2_quota_unlock(dip); gfs2_alloc_put(dip); - gfs2_glock_dq_uninit_m(2, ghs); mark_inode_dirty(inode); + gfs2_glock_dq_uninit_m(2, ghs); d_instantiate(dentry, inode); return 0; fail_gunlock2: gfs2_glock_dq_uninit(ghs + 1); - if (inode && !IS_ERR(inode)) - iput(inode); fail_gunlock: gfs2_glock_dq_uninit(ghs); + if (inode && !IS_ERR(inode)) { + set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); + iput(inode); + } fail: if (bh) brelse(bh); @@ -758,24 +762,10 @@ fail: static int gfs2_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { - struct inode *inode; - int ret; - - for (;;) { - ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0); - if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL))) - return ret; - - inode = gfs2_lookupi(dir, &dentry->d_name, 0); - if (inode) { - if (!IS_ERR(inode)) - break; - return PTR_ERR(inode); - } - } - - d_instantiate(dentry, inode); - return 0; + int excl = 0; + if (nd && (nd->flags & LOOKUP_EXCL)) + excl = 1; + return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl); } /** @@ -902,7 +892,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(al) + + gfs2_rg_blocks(dip) + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -924,8 +914,9 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, gfs2_trans_add_bh(ip->i_gl, dibh, 1); inc_nlink(&ip->i_inode); ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(&ip->i_inode); + ihold(inode); + d_instantiate(dentry, inode); + mark_inode_dirty(inode); out_brelse: brelse(dibh); @@ -947,11 +938,6 @@ out_child: out_parent: gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); - if (!error) { - ihold(inode); - d_instantiate(dentry, inode); - mark_inode_dirty(inode); - } return error; } @@ -1024,8 +1010,6 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip, clear_nlink(inode); else drop_nlink(inode); - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_dinode_out(ip, bh->b_data); mark_inode_dirty(inode); if (inode->i_nlink == 0) gfs2_unlink_di(inode); @@ -1053,13 +1037,8 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) struct buffer_head *bh; struct gfs2_holder ghs[3]; struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh; int error; - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - return error; - gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); @@ -1116,7 +1095,6 @@ out_child: gfs2_glock_dq(ghs); out_parent: gfs2_holder_uninit(ghs); - gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -1139,7 +1117,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) return -ENAMETOOLONG; - return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size); + return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0); } /** @@ -1153,7 +1131,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0); + return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0); } /** @@ -1168,7 +1146,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0); + return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0); } /* @@ -1234,7 +1212,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_inode *ip = GFS2_I(odentry->d_inode); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh; + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; @@ -1248,10 +1226,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, return 0; } - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - return error; - if (odip != ndip) { error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0, &r_gh); @@ -1388,12 +1362,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, al->al_requested = sdp->sd_max_dirres; - error = gfs2_inplace_reserve_ri(ndip); + error = gfs2_inplace_reserve(ndip); if (error) goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(al) + + gfs2_rg_blocks(ndip) + 4 * RES_DINODE + 4 * RES_LEAF + RES_STATFS + RES_QUOTA + 4, 0); if (error) @@ -1459,7 +1433,6 @@ out_gunlock_r: if (r_gh.gh_gl) gfs2_glock_dq_uninit(&r_gh); out: - gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -1563,21 +1536,10 @@ int gfs2_permission(struct inode *inode, int mask) return error; } -static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { - struct inode *inode = &ip->i_inode; - struct buffer_head *dibh; - int error; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - setattr_copy(inode, attr); mark_inode_dirty(inode); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); return 0; } @@ -1589,19 +1551,19 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) * Returns: errno */ -int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +int gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { int error; if (current->journal_info) - return __gfs2_setattr_simple(ip, attr); + return __gfs2_setattr_simple(inode, attr); - error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0); + error = gfs2_trans_begin(GFS2_SB(inode), RES_DINODE, 0); if (error) return error; - error = __gfs2_setattr_simple(ip, attr); - gfs2_trans_end(GFS2_SB(&ip->i_inode)); + error = __gfs2_setattr_simple(inode, attr); + gfs2_trans_end(GFS2_SB(inode)); return error; } @@ -1639,7 +1601,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (error) goto out_gunlock_q; - error = gfs2_setattr_simple(ip, attr); + error = gfs2_setattr_simple(inode, attr); if (error) goto out_end_trans; @@ -1695,12 +1657,12 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) error = gfs2_acl_chmod(ip, attr); else - error = gfs2_setattr_simple(ip, attr); + error = gfs2_setattr_simple(inode, attr); out: - gfs2_glock_dq_uninit(&i_gh); if (!error) mark_inode_dirty(inode); + gfs2_glock_dq_uninit(&i_gh); return error; } @@ -1846,7 +1808,7 @@ const struct inode_operations gfs2_file_iops = { .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, - .check_acl = gfs2_check_acl, + .get_acl = gfs2_get_acl, }; const struct inode_operations gfs2_dir_iops = { @@ -1867,7 +1829,7 @@ const struct inode_operations gfs2_dir_iops = { .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, - .check_acl = gfs2_check_acl, + .get_acl = gfs2_get_acl, }; const struct inode_operations gfs2_symlink_iops = { @@ -1882,6 +1844,6 @@ const struct inode_operations gfs2_symlink_iops = { .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, - .check_acl = gfs2_check_acl, + .get_acl = gfs2_get_acl, }; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 8d90e0c07672..276e7b52b658 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -109,7 +109,7 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip); extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); extern int gfs2_permission(struct inode *inode, int mask); -extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); +extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 85c62923ee29..598646434362 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -624,9 +624,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) bh->b_end_io = end_buffer_write_sync; get_bh(bh); if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) - submit_bh(WRITE_SYNC | REQ_META, bh); + submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); else - submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); + submit_bh(WRITE_FLUSH_FUA | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 05bbb124699f..0301be655b12 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -60,6 +60,29 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) trace_gfs2_pin(bd, 1); } +static bool buffer_is_rgrp(const struct gfs2_bufdata *bd) +{ + return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP; +} + +static void maybe_release_space(struct gfs2_bufdata *bd) +{ + struct gfs2_glock *gl = bd->bd_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_rgrpd *rgd = gl->gl_object; + unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; + struct gfs2_bitmap *bi = rgd->rd_bits + index; + + if (bi->bi_clone == 0) + return; + if (sdp->sd_args.ar_discard) + gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi); + memcpy(bi->bi_clone + bi->bi_offset, + bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); + clear_bit(GBF_FULL, &bi->bi_flags); + rgd->rd_free_clone = rgd->rd_free; +} + /** * gfs2_unpin - Unpin a buffer * @sdp: the filesystem the buffer belongs to @@ -81,6 +104,9 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, mark_buffer_dirty(bh); clear_buffer_pinned(bh); + if (buffer_is_rgrp(bd)) + maybe_release_space(bd); + spin_lock(&sdp->sd_ail_lock); if (bd->bd_ail) { list_del(&bd->bd_ail_st_list); @@ -469,42 +495,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) gfs2_revoke_clean(sdp); } -static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) -{ - struct gfs2_rgrpd *rgd; - struct gfs2_trans *tr = current->journal_info; - - tr->tr_touched = 1; - - rgd = container_of(le, struct gfs2_rgrpd, rd_le); - - gfs2_log_lock(sdp); - if (!list_empty(&le->le_list)){ - gfs2_log_unlock(sdp); - return; - } - gfs2_rgrp_bh_hold(rgd); - sdp->sd_log_num_rg++; - list_add(&le->le_list, &sdp->sd_log_le_rg); - gfs2_log_unlock(sdp); -} - -static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) -{ - struct list_head *head = &sdp->sd_log_le_rg; - struct gfs2_rgrpd *rgd; - - while (!list_empty(head)) { - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list); - list_del_init(&rgd->rd_le.le_list); - sdp->sd_log_num_rg--; - - gfs2_rgrp_repolish_clones(rgd); - gfs2_rgrp_bh_put(rgd); - } - gfs2_assert_warn(sdp, !sdp->sd_log_num_rg); -} - /** * databuf_lo_add - Add a databuf to the transaction. * @@ -705,8 +695,6 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, brelse(bh_log); brelse(bh_ip); - if (error) - break; sdp->sd_replayed_blocks++; } @@ -771,8 +759,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = { }; const struct gfs2_log_operations gfs2_rg_lops = { - .lo_add = rg_lo_add, - .lo_after_commit = rg_lo_after_commit, .lo_name = "rg", }; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 29e1ace7953d..8a139ff1919f 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -16,7 +16,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/rcupdate.h> #include <linux/rculist_bl.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 747238cd9f96..be29858900f6 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_op = REQ_META | + int write_op = REQ_META | REQ_PRIO | (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); BUG_ON(!PageLocked(page)); @@ -225,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(READ_SYNC | REQ_META, bh); + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); if (!(flags & DIO_WAIT)) return 0; @@ -435,7 +435,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); + ll_rw_block(READ_SYNC | REQ_META | REQ_PRIO, 1, &first_bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 516516e0c2a2..cb23c2be731a 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -14,6 +14,7 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/kthread.h> +#include <linux/export.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/gfs2_ondisk.h> @@ -77,8 +78,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_rindex_spin); mutex_init(&sdp->sd_rindex_mutex); - INIT_LIST_HEAD(&sdp->sd_rindex_list); - INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); + sdp->sd_rindex_tree.rb_node = NULL; INIT_LIST_HEAD(&sdp->sd_jindex_list); spin_lock_init(&sdp->sd_jindex_spin); @@ -224,7 +224,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - submit_bio(READ_SYNC | REQ_META, bio); + submit_bio(READ_SYNC | REQ_META | REQ_PRIO, bio); wait_on_page_locked(page); bio_put(bio); if (!PageUptodate(page)) { @@ -652,7 +652,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) fs_err(sdp, "can't lookup journal index: %d\n", error); return PTR_ERR(sdp->sd_jindex); } - ip = GFS2_I(sdp->sd_jindex); /* Load in the journal index special file */ @@ -764,7 +763,6 @@ fail: static int init_inodes(struct gfs2_sbd *sdp, int undo) { int error = 0; - struct gfs2_inode *ip; struct inode *master = sdp->sd_master_dir->d_inode; if (undo) @@ -789,7 +787,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) fs_err(sdp, "can't get resource index inode: %d\n", error); goto fail_statfs; } - ip = GFS2_I(sdp->sd_rindex); sdp->sd_rindex_uptodate = 0; /* Read in the quota inode */ @@ -1018,13 +1015,13 @@ hostdata_error: fsname++; if (lm->lm_mount == NULL) { fs_info(sdp, "Now mounting FS...\n"); - complete(&sdp->sd_locking_init); + complete_all(&sdp->sd_locking_init); return 0; } ret = lm->lm_mount(sdp, fsname); if (ret == 0) fs_info(sdp, "Joined cluster. Now mounting FS...\n"); - complete(&sdp->sd_locking_init); + complete_all(&sdp->sd_locking_init); return ret; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 42e8d23bc047..7e528dc14f85 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -638,15 +638,18 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, unsigned long index = loc >> PAGE_CACHE_SHIFT; unsigned offset = loc & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, pos; - struct buffer_head *bh, *dibh; + struct buffer_head *bh; struct page *page; void *kaddr, *ptr; struct gfs2_quota q, *qp; int err, nbytes; u64 size; - if (gfs2_is_stuffed(ip)) - gfs2_unstuff_dinode(ip, NULL); + if (gfs2_is_stuffed(ip)) { + err = gfs2_unstuff_dinode(ip, NULL); + if (err) + return err; + } memset(&q, 0, sizeof(struct gfs2_quota)); err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q)); @@ -709,7 +712,7 @@ get_a_page: set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(READ_META, 1, &bh); + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; @@ -736,22 +739,13 @@ get_a_page: goto get_a_page; } - /* Update the disk inode timestamp and size (if extended) */ - err = gfs2_meta_inode_buffer(ip, &dibh); - if (err) - goto out; - size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) i_size_write(inode, size); inode->i_mtime = inode->i_atime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); mark_inode_dirty(inode); - -out: return err; + unlock_out: unlock_page(page); page_cache_release(page); @@ -822,7 +816,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) goto out_alloc; if (nalloc) - blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS; + blocks += gfs2_rg_blocks(ip) + nalloc * ind_blocks + RES_STATFS; error = gfs2_trans_begin(sdp, blocks, 0); if (error) @@ -936,7 +930,9 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) unsigned int x; int error = 0; - gfs2_quota_hold(ip, uid, gid); + error = gfs2_quota_hold(ip, uid, gid); + if (error) + return error; if (capable(CAP_SYS_RESOURCE) || sdp->sd_args.ar_quota != GFS2_QUOTA_ON) @@ -1607,7 +1603,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, error = gfs2_inplace_reserve(ip); if (error) goto out_alloc; - blocks += gfs2_rg_blocks(al); + blocks += gfs2_rg_blocks(ip); } /* Some quotas span block boundaries and can update two blocks, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 7f8af1eb02de..96bd6d759f29 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -15,6 +15,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/prefetch.h> #include <linux/blkdev.h> +#include <linux/rbtree.h> #include "gfs2.h" #include "incore.h" @@ -328,18 +329,22 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) { - struct gfs2_rgrpd *rgd; + struct rb_node **newn; + struct gfs2_rgrpd *cur; spin_lock(&sdp->sd_rindex_spin); - - list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) { - if (rgrp_contains_block(rgd, blk)) { - list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + newn = &sdp->sd_rindex_tree.rb_node; + while (*newn) { + cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node); + if (blk < cur->rd_addr) + newn = &((*newn)->rb_left); + else if (blk >= cur->rd_data0 + cur->rd_data) + newn = &((*newn)->rb_right); + else { spin_unlock(&sdp->sd_rindex_spin); - return rgd; + return cur; } } - spin_unlock(&sdp->sd_rindex_spin); return NULL; @@ -354,8 +359,15 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) { - gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list)); - return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list); + const struct rb_node *n; + struct gfs2_rgrpd *rgd; + + spin_lock(&sdp->sd_rindex_spin); + n = rb_first(&sdp->sd_rindex_tree); + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); + + return rgd; } /** @@ -367,47 +379,60 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) { - if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list) + struct gfs2_sbd *sdp = rgd->rd_sbd; + const struct rb_node *n; + + spin_lock(&sdp->sd_rindex_spin); + n = rb_next(&rgd->rd_node); + if (n == NULL) + n = rb_first(&sdp->sd_rindex_tree); + + if (unlikely(&rgd->rd_node == n)) { + spin_unlock(&sdp->sd_rindex_spin); return NULL; - return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list); + } + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); + return rgd; } -static void clear_rgrpdi(struct gfs2_sbd *sdp) +void gfs2_free_clones(struct gfs2_rgrpd *rgd) { - struct list_head *head; + int x; + + for (x = 0; x < rgd->rd_length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + kfree(bi->bi_clone); + bi->bi_clone = NULL; + } +} + +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) +{ + struct rb_node *n; struct gfs2_rgrpd *rgd; struct gfs2_glock *gl; - spin_lock(&sdp->sd_rindex_spin); - sdp->sd_rindex_forward = NULL; - spin_unlock(&sdp->sd_rindex_spin); - - head = &sdp->sd_rindex_list; - while (!list_empty(head)) { - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list); + while ((n = rb_first(&sdp->sd_rindex_tree))) { + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); gl = rgd->rd_gl; - list_del(&rgd->rd_list); - list_del(&rgd->rd_list_mru); + rb_erase(n, &sdp->sd_rindex_tree); if (gl) { + spin_lock(&gl->gl_spin); gl->gl_object = NULL; + spin_unlock(&gl->gl_spin); gfs2_glock_add_to_lru(gl); gfs2_glock_put(gl); } + gfs2_free_clones(rgd); kfree(rgd->rd_bits); kmem_cache_free(gfs2_rgrpd_cachep, rgd); } } -void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) -{ - mutex_lock(&sdp->sd_rindex_mutex); - clear_rgrpdi(sdp); - mutex_unlock(&sdp->sd_rindex_mutex); -} - static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) { printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); @@ -524,22 +549,34 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) return total_data; } -static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf) +static void rgd_insert(struct gfs2_rgrpd *rgd) { - const struct gfs2_rindex *str = buf; + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*newn) { + struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, + rd_node); + + parent = *newn; + if (rgd->rd_addr < cur->rd_addr) + newn = &((*newn)->rb_left); + else if (rgd->rd_addr > cur->rd_addr) + newn = &((*newn)->rb_right); + else + return; + } - rgd->rd_addr = be64_to_cpu(str->ri_addr); - rgd->rd_length = be32_to_cpu(str->ri_length); - rgd->rd_data0 = be64_to_cpu(str->ri_data0); - rgd->rd_data = be32_to_cpu(str->ri_data); - rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes); + rb_link_node(&rgd->rd_node, parent, newn); + rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree); } /** * read_rindex_entry - Pull in a new resource index entry from the disk * @gl: The glock covering the rindex inode * - * Returns: 0 on success, error code otherwise + * Returns: 0 on success, > 0 on EOF, error code otherwise */ static int read_rindex_entry(struct gfs2_inode *ip, @@ -547,44 +584,53 @@ static int read_rindex_entry(struct gfs2_inode *ip, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); - char buf[sizeof(struct gfs2_rindex)]; + struct gfs2_rindex buf; int error; struct gfs2_rgrpd *rgd; - error = gfs2_internal_read(ip, ra_state, buf, &pos, + if (pos >= i_size_read(&ip->i_inode)) + return 1; + + error = gfs2_internal_read(ip, ra_state, (char *)&buf, &pos, sizeof(struct gfs2_rindex)); - if (!error) - return 0; - if (error != sizeof(struct gfs2_rindex)) { - if (error > 0) - error = -EIO; - return error; - } + + if (error != sizeof(struct gfs2_rindex)) + return (error == 0) ? 1 : error; rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); error = -ENOMEM; if (!rgd) return error; - mutex_init(&rgd->rd_mutex); - lops_init_le(&rgd->rd_le, &gfs2_rg_lops); rgd->rd_sbd = sdp; + rgd->rd_addr = be64_to_cpu(buf.ri_addr); + rgd->rd_length = be32_to_cpu(buf.ri_length); + rgd->rd_data0 = be64_to_cpu(buf.ri_data0); + rgd->rd_data = be32_to_cpu(buf.ri_data); + rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); - list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list); - list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); - - gfs2_rindex_in(rgd, buf); error = compute_bitstructs(rgd); if (error) - return error; + goto fail; error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) - return error; + goto fail; rgd->rd_gl->gl_object = rgd; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + if (rgd->rd_data > sdp->sd_max_rg_data) + sdp->sd_max_rg_data = rgd->rd_data; + spin_lock(&sdp->sd_rindex_spin); + rgd_insert(rgd); + sdp->sd_rgrps++; + spin_unlock(&sdp->sd_rindex_spin); + return error; + +fail: + kfree(rgd->rd_bits); + kmem_cache_free(gfs2_rgrpd_cachep, rgd); return error; } @@ -595,40 +641,28 @@ static int read_rindex_entry(struct gfs2_inode *ip, * Returns: 0 on successful update, error code otherwise */ -int gfs2_ri_update(struct gfs2_inode *ip) +static int gfs2_ri_update(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; struct file_ra_state ra_state; - u64 rgrp_count = i_size_read(inode); - struct gfs2_rgrpd *rgd; - unsigned int max_data = 0; int error; - do_div(rgrp_count, sizeof(struct gfs2_rindex)); - clear_rgrpdi(sdp); - file_ra_state_init(&ra_state, inode->i_mapping); - for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) { + do { error = read_rindex_entry(ip, &ra_state); - if (error) { - clear_rgrpdi(sdp); - return error; - } - } + } while (error == 0); + + if (error < 0) + return error; - list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) - if (rgd->rd_data > max_data) - max_data = rgd->rd_data; - sdp->sd_max_rg_data = max_data; sdp->sd_rindex_uptodate = 1; return 0; } /** - * gfs2_rindex_hold - Grab a lock on the rindex + * gfs2_rindex_update - Update the rindex if required * @sdp: The GFS2 superblock - * @ri_gh: the glock holder * * We grab a lock on the rindex inode to make sure that it doesn't * change whilst we are performing an operation. We keep this lock @@ -640,30 +674,29 @@ int gfs2_ri_update(struct gfs2_inode *ip) * special file, which might have been updated if someone expanded the * filesystem (via gfs2_grow utility), which adds new resource groups. * - * Returns: 0 on success, error code otherwise + * Returns: 0 on succeess, error code otherwise */ -int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) +int gfs2_rindex_update(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); struct gfs2_glock *gl = ip->i_gl; - int error; - - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh); - if (error) - return error; + struct gfs2_holder ri_gh; + int error = 0; /* Read new copy from disk if we don't have the latest */ if (!sdp->sd_rindex_uptodate) { mutex_lock(&sdp->sd_rindex_mutex); - if (!sdp->sd_rindex_uptodate) { + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); + if (error) + return error; + if (!sdp->sd_rindex_uptodate) error = gfs2_ri_update(ip); - if (error) - gfs2_glock_dq_uninit(ri_gh); - } + gfs2_glock_dq_uninit(&ri_gh); mutex_unlock(&sdp->sd_rindex_mutex); } + return error; } @@ -694,7 +727,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) } /** - * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps + * gfs2_rgrp_go_lock - Read in a RG's header and bitmaps * @rgd: the struct gfs2_rgrpd describing the RG to read in * * Read in all of a Resource Group's header and bitmap blocks. @@ -703,8 +736,9 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) * Returns: errno */ -int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) +int gfs2_rgrp_go_lock(struct gfs2_holder *gh) { + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_glock *gl = rgd->rd_gl; unsigned int length = rgd->rd_length; @@ -712,17 +746,6 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) unsigned int x, y; int error; - mutex_lock(&rgd->rd_mutex); - - spin_lock(&sdp->sd_rindex_spin); - if (rgd->rd_bh_count) { - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); - mutex_unlock(&rgd->rd_mutex); - return 0; - } - spin_unlock(&sdp->sd_rindex_spin); - for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); @@ -747,15 +770,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); + rgd->rd_free_clone = rgd->rd_free; } - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_free; - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); - - mutex_unlock(&rgd->rd_mutex); - return 0; fail: @@ -765,52 +782,32 @@ fail: bi->bi_bh = NULL; gfs2_assert_warn(sdp, !bi->bi_clone); } - mutex_unlock(&rgd->rd_mutex); return error; } -void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd) -{ - struct gfs2_sbd *sdp = rgd->rd_sbd; - - spin_lock(&sdp->sd_rindex_spin); - gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); -} - /** - * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get() + * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get() * @rgd: the struct gfs2_rgrpd describing the RG to read in * */ -void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) +void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) { - struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; int x, length = rgd->rd_length; - spin_lock(&sdp->sd_rindex_spin); - gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); - if (--rgd->rd_bh_count) { - spin_unlock(&sdp->sd_rindex_spin); - return; - } - for (x = 0; x < length; x++) { struct gfs2_bitmap *bi = rgd->rd_bits + x; - kfree(bi->bi_clone); - bi->bi_clone = NULL; brelse(bi->bi_bh); bi->bi_bh = NULL; } - spin_unlock(&sdp->sd_rindex_spin); } -static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, - const struct gfs2_bitmap *bi) +void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi) { struct super_block *sb = sdp->sd_vfs; struct block_device *bdev = sb->s_bdev; @@ -823,7 +820,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, unsigned int x; for (x = 0; x < bi->bi_len; x++) { - const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x; + const u8 *orig = bh->b_data + bi->bi_offset + x; const u8 *clone = bi->bi_clone + bi->bi_offset + x; u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); diff &= 0x55; @@ -862,28 +859,6 @@ fail: sdp->sd_args.ar_discard = 0; } -void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) -{ - struct gfs2_sbd *sdp = rgd->rd_sbd; - unsigned int length = rgd->rd_length; - unsigned int x; - - for (x = 0; x < length; x++) { - struct gfs2_bitmap *bi = rgd->rd_bits + x; - if (!bi->bi_clone) - continue; - if (sdp->sd_args.ar_discard) - gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); - clear_bit(GBF_FULL, &bi->bi_flags); - memcpy(bi->bi_clone + bi->bi_offset, - bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); - } - - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_free; - spin_unlock(&sdp->sd_rindex_spin); -} - /** * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode * @ip: the incore GFS2 inode structure @@ -893,38 +868,35 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int error; BUG_ON(ip->i_alloc != NULL); ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS); + error = gfs2_rindex_update(sdp); + if (error) + fs_warn(sdp, "rindex update returns %d\n", error); return ip->i_alloc; } /** * try_rgrp_fit - See if a given reservation will fit in a given RG * @rgd: the RG data - * @al: the struct gfs2_alloc structure describing the reservation + * @ip: the inode * * If there's room for the requested blocks to be allocated from the RG: - * Sets the $al_rgd field in @al. * * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) */ -static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) +static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip) { - struct gfs2_sbd *sdp = rgd->rd_sbd; - int ret = 0; + const struct gfs2_alloc *al = ip->i_alloc; if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) return 0; - - spin_lock(&sdp->sd_rindex_spin); - if (rgd->rd_free_clone >= al->al_requested) { - al->al_rgd = rgd; - ret = 1; - } - spin_unlock(&sdp->sd_rindex_spin); - - return ret; + if (rgd->rd_free_clone >= al->al_requested) + return 1; + return 0; } /** @@ -992,76 +964,6 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip } /** - * recent_rgrp_next - get next RG from "recent" list - * @cur_rgd: current rgrp - * - * Returns: The next rgrp in the recent list - */ - -static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd) -{ - struct gfs2_sbd *sdp = cur_rgd->rd_sbd; - struct list_head *head; - struct gfs2_rgrpd *rgd; - - spin_lock(&sdp->sd_rindex_spin); - head = &sdp->sd_rindex_mru_list; - if (unlikely(cur_rgd->rd_list_mru.next == head)) { - spin_unlock(&sdp->sd_rindex_spin); - return NULL; - } - rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru); - spin_unlock(&sdp->sd_rindex_spin); - return rgd; -} - -/** - * forward_rgrp_get - get an rgrp to try next from full list - * @sdp: The GFS2 superblock - * - * Returns: The rgrp to try next - */ - -static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp) -{ - struct gfs2_rgrpd *rgd; - unsigned int journals = gfs2_jindex_size(sdp); - unsigned int rg = 0, x; - - spin_lock(&sdp->sd_rindex_spin); - - rgd = sdp->sd_rindex_forward; - if (!rgd) { - if (sdp->sd_rgrps >= journals) - rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals; - - for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg; - x++, rgd = gfs2_rgrpd_get_next(rgd)) - /* Do Nothing */; - - sdp->sd_rindex_forward = rgd; - } - - spin_unlock(&sdp->sd_rindex_spin); - - return rgd; -} - -/** - * forward_rgrp_set - set the forward rgrp pointer - * @sdp: the filesystem - * @rgd: The new forward rgrp - * - */ - -static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) -{ - spin_lock(&sdp->sd_rindex_spin); - sdp->sd_rindex_forward = rgd; - spin_unlock(&sdp->sd_rindex_spin); -} - -/** * get_local_rgrp - Choose and lock a rgrp for allocation * @ip: the inode to reserve space for * @rgp: the chosen and locked rgrp @@ -1076,14 +978,18 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; struct gfs2_alloc *al = ip->i_alloc; - int flags = LM_FLAG_TRY; - int skipped = 0; - int loops = 0; int error, rg_locked; + int loops = 0; + + if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) + rgd = begin = ip->i_rgd; + else + rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal); - rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); + if (rgd == NULL) + return -EBADSLT; - while (rgd) { + while (loops < 3) { rg_locked = 0; if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { @@ -1095,92 +1001,36 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) } switch (error) { case 0: - if (try_rgrp_fit(rgd, al)) - goto out; + if (try_rgrp_fit(rgd, ip)) { + ip->i_rgd = rgd; + return 0; + } if (rgd->rd_flags & GFS2_RDF_CHECK) try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); /* fall through */ case GLR_TRYFAILED: - rgd = recent_rgrp_next(rgd); - break; - - default: - return error; - } - } - - /* Go through full list of rgrps */ - - begin = rgd = forward_rgrp_get(sdp); - - for (;;) { - rg_locked = 0; - - if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { - rg_locked = 1; - error = 0; - } else { - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, - &al->al_rgd_gh); - } - switch (error) { - case 0: - if (try_rgrp_fit(rgd, al)) - goto out; - if (rgd->rd_flags & GFS2_RDF_CHECK) - try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); - if (!rg_locked) - gfs2_glock_dq_uninit(&al->al_rgd_gh); - break; - - case GLR_TRYFAILED: - skipped++; + rgd = gfs2_rgrpd_get_next(rgd); + if (rgd == begin) + loops++; break; - default: return error; } - - rgd = gfs2_rgrpd_get_next(rgd); - if (!rgd) - rgd = gfs2_rgrpd_get_first(sdp); - - if (rgd == begin) { - if (++loops >= 3) - return -ENOSPC; - if (!skipped) - loops++; - flags = 0; - if (loops == 2) - gfs2_log_flush(sdp, NULL); - } } -out: - if (begin) { - spin_lock(&sdp->sd_rindex_spin); - list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); - spin_unlock(&sdp->sd_rindex_spin); - rgd = gfs2_rgrpd_get_next(rgd); - if (!rgd) - rgd = gfs2_rgrpd_get_first(sdp); - forward_rgrp_set(sdp, rgd); - } - - return 0; + return -ENOSPC; } /** - * gfs2_inplace_reserve_i - Reserve space in the filesystem + * gfs2_inplace_reserve - Reserve space in the filesystem * @ip: the inode to reserve space for * * Returns: errno */ -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, - char *file, unsigned int line) +int gfs2_inplace_reserve(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; @@ -1191,45 +1041,22 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; - if (hold_rindex) { - /* We need to hold the rindex unless the inode we're using is - the rindex itself, in which case it's already held. */ - if (ip != GFS2_I(sdp->sd_rindex)) - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); - else if (!sdp->sd_rgrps) /* We may not have the rindex read - in, so: */ - error = gfs2_ri_update(ip); - if (error) - return error; - } - -try_again: do { error = get_local_rgrp(ip, &last_unlinked); - /* If there is no space, flushing the log may release some */ - if (error) { - if (ip == GFS2_I(sdp->sd_rindex) && - !sdp->sd_rindex_uptodate) { - error = gfs2_ri_update(ip); - if (error) - return error; - goto try_again; - } - gfs2_log_flush(sdp, NULL); + if (error != -ENOSPC) + break; + /* Check that fs hasn't grown if writing to rindex */ + if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + break; + continue; } - } while (error && tries++ < 3); - - if (error) { - if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) - gfs2_glock_dq_uninit(&al->al_ri_gh); - return error; - } - - /* no error, so we have the rgrp set in the inode's allocation. */ - al->al_file = file; - al->al_line = line; + /* Flushing the log may release space */ + gfs2_log_flush(sdp, NULL); + } while (tries++ < 3); - return 0; + return error; } /** @@ -1241,20 +1068,10 @@ try_again: void gfs2_inplace_release(struct gfs2_inode *ip) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; - if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1) - fs_warn(sdp, "al_alloced = %u, al_requested = %u " - "al_file = %s, al_line = %u\n", - al->al_alloced, al->al_requested, al->al_file, - al->al_line); - - al->al_rgd = NULL; if (al->al_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl) - gfs2_glock_dq_uninit(&al->al_ri_gh); } /** @@ -1352,6 +1169,7 @@ do_search: /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone bitmaps, so we must search the originals for that. */ buffer = bi->bi_bh->b_data + bi->bi_offset; + WARN_ON(!buffer_uptodate(bi->bi_bh)); if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) buffer = bi->bi_clone + bi->bi_offset; @@ -1371,6 +1189,7 @@ skip: if (blk == BFITNOENT) return blk; + *n = 1; if (old_state == new_state) goto out; @@ -1503,7 +1322,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) if (al == NULL) return -ECANCELED; - rgd = al->al_rgd; + rgd = ip->i_rgd; if (rgrp_contains_block(rgd, ip->i_goal)) goal = ip->i_goal - rgd->rd_data0; @@ -1518,7 +1337,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; - ip->i_goal = block; + ip->i_goal = block + *n - 1; error = gfs2_meta_inode_buffer(ip, &dibh); if (error == 0) { struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; @@ -1539,9 +1358,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) gfs2_statfs_change(sdp, 0, -(s64)*n, 0); gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid); - spin_lock(&sdp->sd_rindex_spin); rgd->rd_free_clone -= *n; - spin_unlock(&sdp->sd_rindex_spin); trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED); *bn = block; return 0; @@ -1564,7 +1381,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_alloc *al = dip->i_alloc; - struct gfs2_rgrpd *rgd = al->al_rgd; + struct gfs2_rgrpd *rgd = dip->i_rgd; u32 blk; u64 block; unsigned int n = 1; @@ -1594,9 +1411,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) gfs2_statfs_change(sdp, 0, -1, +1); gfs2_trans_add_unrevoke(sdp, block, 1); - spin_lock(&sdp->sd_rindex_spin); rgd->rd_free_clone--; - spin_unlock(&sdp->sd_rindex_spin); trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); *bn = block; return 0; @@ -1629,8 +1444,6 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - gfs2_trans_add_rg(rgd); - /* Directories keep their data in the metadata address space */ if (meta || ip->i_depth) gfs2_meta_wipe(ip, bstart, blen); @@ -1666,7 +1479,6 @@ void gfs2_unlink_di(struct inode *inode) trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED); gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - gfs2_trans_add_rg(rgd); } static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) @@ -1688,7 +1500,6 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_statfs_change(sdp, 0, +1, -1); - gfs2_trans_add_rg(rgd); } @@ -1714,41 +1525,33 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) { struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh, rgd_gh; - struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); - int ri_locked = 0; + struct gfs2_holder rgd_gh; int error; - if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto fail; - ri_locked = 1; - } + error = gfs2_rindex_update(sdp); + if (error) + return error; error = -EINVAL; rgd = gfs2_blk2rgrpd(sdp, no_addr); if (!rgd) - goto fail_rindex; + goto fail; error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); if (error) - goto fail_rindex; + goto fail; if (gfs2_get_block_type(rgd, no_addr) != type) error = -ESTALE; gfs2_glock_dq_uninit(&rgd_gh); -fail_rindex: - if (ri_locked) - gfs2_glock_dq_uninit(&ri_gh); fail: return error; } /** * gfs2_rlist_add - add a RG to a list of RGs - * @sdp: the filesystem + * @ip: the inode * @rlist: the list of resource groups * @block: the block * @@ -1758,9 +1561,10 @@ fail: * */ -void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, +void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, u64 block) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; struct gfs2_rgrpd **tmp; unsigned int new_space; @@ -1769,12 +1573,15 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) return; - rgd = gfs2_blk2rgrpd(sdp, block); + if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block)) + rgd = ip->i_rgd; + else + rgd = gfs2_blk2rgrpd(sdp, block); if (!rgd) { - if (gfs2_consist(sdp)) - fs_err(sdp, "block = %llu\n", (unsigned long long)block); + fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block); return; } + ip->i_rgd = rgd; for (x = 0; x < rlist->rl_rgrps; x++) if (rlist->rl_rgd[x] == rgd) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index d253f9a8c70e..cf5c50180192 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -18,18 +18,15 @@ struct gfs2_holder; extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); -struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); -struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); -struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); +extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); -extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); - -extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); -extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); -extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); - -extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); +extern int gfs2_rindex_update(struct gfs2_sbd *sdp); +extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); +extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); +extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); static inline void gfs2_alloc_put(struct gfs2_inode *ip) @@ -39,16 +36,9 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip) ip->i_alloc = NULL; } -extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, - char *file, unsigned int line); -#define gfs2_inplace_reserve(ip) \ - gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__) -#define gfs2_inplace_reserve_ri(ip) \ - gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__) - +extern int gfs2_inplace_reserve(struct gfs2_inode *ip); extern void gfs2_inplace_release(struct gfs2_inode *ip); -extern int gfs2_ri_update(struct gfs2_inode *ip); extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); @@ -66,11 +56,14 @@ struct gfs2_rgrp_list { struct gfs2_holder *rl_ghs; }; -extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, +extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, u64 block); extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi); #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b7beadd9ba4c..71e420989f77 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -752,51 +752,77 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); struct backing_dev_info *bdi = metamapping->backing_dev_info; - struct gfs2_holder gh; + int ret = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + if (bdi->dirty_exceeded) + gfs2_ail1_flush(sdp, wbc); + else + filemap_fdatawrite(metamapping); + if (wbc->sync_mode == WB_SYNC_ALL) + ret = filemap_fdatawait(metamapping); + if (ret) + mark_inode_dirty_sync(inode); + return ret; +} + +/** + * gfs2_dirty_inode - check for atime updates + * @inode: The inode in question + * @flags: The type of dirty + * + * Unfortunately it can be called under any combination of inode + * glock and transaction lock, so we have to check carefully. + * + * At the moment this deals only with atime - it should be possible + * to expand that role in future, once a review of the locking has + * been carried out. + */ + +static void gfs2_dirty_inode(struct inode *inode, int flags) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *bh; - struct timespec atime; - struct gfs2_dinode *di; - int ret = -EAGAIN; - int unlock_required = 0; - - /* Skip timestamp update, if this is from a memalloc */ - if (current->flags & PF_MEMALLOC) - goto do_flush; + struct gfs2_holder gh; + int need_unlock = 0; + int need_endtrans = 0; + int ret; + + if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC))) + return; + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (ret) - goto do_flush; - unlock_required = 1; + if (ret) { + fs_err(sdp, "dirty_inode: glock %d\n", ret); + return; + } + need_unlock = 1; } - ret = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (ret) - goto do_unlock; + + if (current->journal_info == NULL) { + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) { + fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret); + goto out; + } + need_endtrans = 1; + } + ret = gfs2_meta_inode_buffer(ip, &bh); if (ret == 0) { - di = (struct gfs2_dinode *)bh->b_data; - atime.tv_sec = be64_to_cpu(di->di_atime); - atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); - if (timespec_compare(&inode->i_atime, &atime) > 0) { - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_dinode_out(ip, bh->b_data); - } + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); brelse(bh); } - gfs2_trans_end(sdp); -do_unlock: - if (unlock_required) + + if (need_endtrans) + gfs2_trans_end(sdp); +out: + if (need_unlock) gfs2_glock_dq_uninit(&gh); -do_flush: - if (wbc->sync_mode == WB_SYNC_ALL) - gfs2_log_flush(GFS2_SB(inode), ip->i_gl); - filemap_fdatawrite(metamapping); - if (bdi->dirty_exceeded) - gfs2_ail1_flush(sdp, wbc); - if (!ret && (wbc->sync_mode == WB_SYNC_ALL)) - ret = filemap_fdatawait(metamapping); - if (ret) - mark_inode_dirty_sync(inode); - return ret; } /** @@ -1011,7 +1037,6 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd, static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) { - struct gfs2_holder ri_gh; struct gfs2_rgrpd *rgd_next; struct gfs2_holder *gha, *gh; unsigned int slots = 64; @@ -1024,10 +1049,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host if (!gha) return -ENOMEM; - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto out; - rgd_next = gfs2_rgrpd_get_first(sdp); for (;;) { @@ -1070,9 +1091,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host yield(); } - gfs2_glock_dq_uninit(&ri_gh); - -out: kfree(gha); return error; } @@ -1124,6 +1142,10 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) struct gfs2_statfs_change_host sc; int error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + if (gfs2_tune_get(sdp, gt_statfs_slow)) error = gfs2_statfs_slow(sdp, &sc); else @@ -1394,21 +1416,17 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) if (error) goto out; - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); - if (error) - goto out_qs; - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); if (!rgd) { gfs2_consist_inode(ip); error = -EIO; - goto out_rindex_relse; + goto out_qs; } error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &al->al_rgd_gh); if (error) - goto out_rindex_relse; + goto out_qs; error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, sdp->sd_jdesc->jd_blocks); @@ -1423,8 +1441,6 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) out_rg_gunlock: gfs2_glock_dq_uninit(&al->al_rgd_gh); -out_rindex_relse: - gfs2_glock_dq_uninit(&al->al_ri_gh); out_qs: gfs2_quota_unhold(ip); out: @@ -1471,9 +1487,11 @@ static void gfs2_evict_inode(struct inode *inode) goto out; } - error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); - if (error) - goto out_truncate; + if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (error) + goto out_truncate; + } if (test_bit(GIF_INVALID, &ip->i_flags)) { error = gfs2_inode_refresh(ip); @@ -1513,6 +1531,10 @@ static void gfs2_evict_inode(struct inode *inode) goto out_unlock; out_truncate: + gfs2_log_flush(sdp, ip->i_gl); + write_inode_now(inode, 1); + gfs2_ail_flush(ip->i_gl, 0); + /* Case 2 starts here */ error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) @@ -1552,6 +1574,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) if (ip) { ip->i_flags = 0; ip->i_gl = NULL; + ip->i_rgd = NULL; } return &ip->i_inode; } @@ -1572,6 +1595,7 @@ const struct super_operations gfs2_super_ops = { .alloc_inode = gfs2_alloc_inode, .destroy_inode = gfs2_destroy_inode, .write_inode = gfs2_write_inode, + .dirty_inode = gfs2_dirty_inode, .evict_inode = gfs2_evict_inode, .put_super = gfs2_put_super, .sync_fs = gfs2_sync_fs, diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 9ec73a854111..86ac75d99d31 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -185,8 +185,3 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) gfs2_log_unlock(sdp); } -void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) -{ - lops_add(rgd->rd_sbd, &rgd->rd_le); -} - diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index fb56b783e028..f8f101ef600c 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -28,20 +28,20 @@ struct gfs2_glock; /* reserve either the number of blocks to be allocated plus the rg header * block, or all of the blocks in the rg, whichever is smaller */ -static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al) +static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip) { - return (al->al_requested < al->al_rgd->rd_length)? - al->al_requested + 1 : al->al_rgd->rd_length; + const struct gfs2_alloc *al = ip->i_alloc; + if (al->al_requested < ip->i_rgd->rd_length) + return al->al_requested + 1; + return ip->i_rgd->rd_length; } -int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, - unsigned int revokes); +extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes); -void gfs2_trans_end(struct gfs2_sbd *sdp); - -void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); -void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); -void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); -void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); +extern void gfs2_trans_end(struct gfs2_sbd *sdp); +extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); +extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); #endif /* __TRANS_DOT_H__ */ diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 439b61c03262..71d7bf830c09 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -332,15 +332,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, if (error) goto out_alloc; - error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); - if (error) - goto out_quota; - error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL); - gfs2_glock_dq_uninit(&al->al_ri_gh); - -out_quota: gfs2_quota_unhold(ip); out_alloc: gfs2_alloc_put(ip); @@ -734,7 +727,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, goto out_gunlock_q; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), - blks + gfs2_rg_blocks(al) + + blks + gfs2_rg_blocks(ip) + RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipres; @@ -1296,7 +1289,8 @@ fail: int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_ea_location el; int error; @@ -1319,7 +1313,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) if (error) return error; - error = gfs2_setattr_simple(ip, attr); + error = gfs2_setattr_simple(inode, attr); gfs2_trans_end(sdp); return error; } @@ -1362,14 +1356,14 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) blen++; else { if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); bstart = bn; blen = 1; } blks++; } if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); else goto out; @@ -1501,24 +1495,18 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) if (error) goto out_alloc; - error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); - if (error) - goto out_quota; - error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); if (error) - goto out_rindex; + goto out_quota; if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { error = ea_dealloc_indirect(ip); if (error) - goto out_rindex; + goto out_quota; } error = ea_dealloc_block(ip); -out_rindex: - gfs2_glock_dq_uninit(&al->al_ri_gh); out_quota: gfs2_quota_unhold(ip); out_alloc: diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 3ebc437736fe..1cbdeea1db44 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -46,11 +46,26 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke case HFS_EXT_CNID: hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); + if (HFS_I(tree->inode)->alloc_blocks > + HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; case HFS_CAT_CNID: hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize, mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); + + if (!HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records " + "(0 size).\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; default: @@ -59,11 +74,6 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke } unlock_new_inode(tree->inode); - if (!HFS_I(tree->inode)->first_blocks) { - printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n"); - goto free_inode; - } - mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index b4d70b13be92..bce4eef91a06 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -198,7 +198,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, int mode, res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { - inode->i_nlink = 0; + clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; @@ -227,7 +227,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { - inode->i_nlink = 0; + clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 96a1b625fc74..a1a9fdcd2a00 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -183,7 +183,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; @@ -313,7 +313,7 @@ static int hfs_read_inode(struct inode *inode, void *data) /* Initialize the inode */ inode->i_uid = hsb->s_uid; inode->i_gid = hsb->s_gid; - inode->i_nlink = 1; + set_nlink(inode, 1); if (idata->key) HFS_I(inode)->cat_key = *idata->key; diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c index e673a88b8ae7..b1ce4c7ad3fb 100644 --- a/fs/hfs/trans.c +++ b/fs/hfs/trans.c @@ -40,6 +40,8 @@ int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in) src = in->name; srclen = in->len; + if (srclen > HFS_NAMELEN) + srclen = HFS_NAMELEN; dst = out; dstlen = HFS_MAX_NAMELEN; if (nls_io) { diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 25b2443a004c..4536cd3f15ae 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -415,7 +415,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, goto out; out_err: - inode->i_nlink = 0; + clear_nlink(inode); hfsplus_delete_inode(inode); iput(inode); out: @@ -440,7 +440,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); if (res) { - inode->i_nlink = 0; + clear_nlink(inode); hfsplus_delete_inode(inode); iput(inode); goto out; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 4cc1e3a36ec7..40e1413be4cf 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -391,7 +391,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; hip = HFSPLUS_I(inode); @@ -512,7 +512,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_folder)); hfsplus_get_perms(inode, &folder->permissions, 1); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_size = 2 + be32_to_cpu(folder->valence); inode->i_atime = hfsp_mt2ut(folder->access_date); inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); @@ -532,11 +532,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? &file->rsrc_fork : &file->data_fork); hfsplus_get_perms(inode, &file->permissions, 0); - inode->i_nlink = 1; + set_nlink(inode, 1); if (S_ISREG(inode->i_mode)) { if (file->permissions.dev) - inode->i_nlink = - be32_to_cpu(file->permissions.dev); + set_nlink(inode, + be32_to_cpu(file->permissions.dev)); inode->i_op = &hfsplus_file_inode_operations; inode->i_fop = &hfsplus_file_operations; inode->i_mapping->a_ops = &hfsplus_aops; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index c106ca22e812..d24a9b666a23 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -344,6 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) struct inode *root, *inode; struct qstr str; struct nls_table *nls = NULL; + u64 last_fs_block, last_fs_page; int err; err = -EINVAL; @@ -399,9 +400,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!sbi->rsrc_clump_blocks) sbi->rsrc_clump_blocks = 1; - err = generic_check_addressable(sbi->alloc_blksz_shift, - sbi->total_blocks); - if (err) { + err = -EFBIG; + last_fs_block = sbi->total_blocks - 1; + last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >> + PAGE_CACHE_SHIFT; + + if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) || + (last_fs_page > (pgoff_t)(~0ULL))) { printk(KERN_ERR "hfs: filesystem size too large.\n"); goto out_free_vhdr; } @@ -525,8 +530,8 @@ out_close_cat_tree: out_close_ext_tree: hfs_btree_close(sbi->ext_tree); out_free_vhdr: - kfree(sbi->s_vhdr); - kfree(sbi->s_backup_vhdr); + kfree(sbi->s_vhdr_buf); + kfree(sbi->s_backup_vhdr_buf); out_unload_nls: unload_nls(sbi->nls); unload_nls(nls); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 10e515a0d452..7daf4b852d1c 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -272,9 +272,9 @@ reread: return 0; out_free_backup_vhdr: - kfree(sbi->s_backup_vhdr); + kfree(sbi->s_backup_vhdr_buf); out_free_vhdr: - kfree(sbi->s_vhdr); + kfree(sbi->s_vhdr_buf); out: return error; } diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 0d22afdd4611..2f72da5ae686 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -541,7 +541,7 @@ static int read_name(struct inode *ino, char *name) ino->i_ino = st.ino; ino->i_mode = st.mode; - ino->i_nlink = st.nlink; + set_nlink(ino, st.nlink); ino->i_uid = st.uid; ino->i_gid = st.gid; ino->i_atime = st.atime; diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index d51a98384bc0..dd7bc38a3825 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -16,7 +16,6 @@ #include <sys/vfs.h> #include "hostfs.h" #include "os.h" -#include "user.h" #include <utime.h> static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 96a8ed91cedd..2fa0089a02a8 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -247,7 +247,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name result->i_mode &= ~0111; result->i_op = &hpfs_file_iops; result->i_fop = &hpfs_file_ops; - result->i_nlink = 1; + set_nlink(result, 1); } unlock_new_inode(result); } diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 331b5e234ef3..de946170ebb1 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -311,8 +311,8 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) /* super.c */ -void hpfs_error(struct super_block *, const char *, ...) - __attribute__((format (printf, 2, 3))); +__printf(2, 3) +void hpfs_error(struct super_block *, const char *, ...); int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); unsigned hpfs_count_one_bitmap(struct super_block *, secno); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 338cd8368451..3b2cec29972b 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -53,7 +53,7 @@ void hpfs_read_inode(struct inode *i) i->i_mode &= ~0111; i->i_op = &hpfs_file_iops; i->i_fop = &hpfs_file_ops; - i->i_nlink = 0;*/ + clear_nlink(i);*/ make_bad_inode(i); return; } @@ -77,7 +77,7 @@ void hpfs_read_inode(struct inode *i) i->i_mode = S_IFLNK | 0777; i->i_op = &page_symlink_inode_operations; i->i_data.a_ops = &hpfs_symlink_aops; - i->i_nlink = 1; + set_nlink(i, 1); i->i_size = ea_size; i->i_blocks = 1; brelse(bh); @@ -101,7 +101,7 @@ void hpfs_read_inode(struct inode *i) } if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { brelse(bh); - i->i_nlink = 1; + set_nlink(i, 1); i->i_size = 0; i->i_blocks = 1; init_special_inode(i, mode, @@ -125,13 +125,13 @@ void hpfs_read_inode(struct inode *i) hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL); i->i_blocks = 4 * n_dnodes; i->i_size = 2048 * n_dnodes; - i->i_nlink = 2 + n_subdirs; + set_nlink(i, 2 + n_subdirs); } else { i->i_mode |= S_IFREG; if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111; i->i_op = &hpfs_file_iops; i->i_fop = &hpfs_file_ops; - i->i_nlink = 1; + set_nlink(i, 1); i->i_size = le32_to_cpu(fnode->file_size); i->i_blocks = ((i->i_size + 511) >> 9) + 1; i->i_data.a_ops = &hpfs_aops; diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 2df69e2f07cf..ea91fcb0ef9b 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -56,7 +56,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) result->i_fop = &hpfs_dir_ops; result->i_blocks = 4; result->i_size = 2048; - result->i_nlink = 2; + set_nlink(result, 2); if (dee.read_only) result->i_mode &= ~0222; @@ -150,7 +150,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc result->i_mode &= ~0111; result->i_op = &hpfs_file_iops; result->i_fop = &hpfs_file_ops; - result->i_nlink = 1; + set_nlink(result, 1); hpfs_i(result)->i_parent_dir = dir->i_ino; result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); result->i_ctime.tv_nsec = 0; @@ -242,7 +242,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t hpfs_i(result)->i_ea_size = 0; result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); - result->i_nlink = 1; + set_nlink(result, 1); result->i_size = 0; result->i_blocks = 1; init_special_inode(result, mode, rdev); @@ -318,7 +318,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); result->i_blocks = 1; - result->i_nlink = 1; + set_nlink(result, 1); result->i_size = strlen(symlink); result->i_op = &page_symlink_inode_operations; result->i_data.a_ops = &hpfs_symlink_aops; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 8635be5ffd97..f590b1160c6c 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -16,6 +16,7 @@ #include <linux/statfs.h> #include <linux/types.h> #include <linux/pid_namespace.h> +#include <linux/namei.h> #include <asm/uaccess.h> #include "os.h" @@ -701,7 +702,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) inode->i_ctime = proc_ino->i_ctime; inode->i_ino = proc_ino->i_ino; inode->i_mode = proc_ino->i_mode; - inode->i_nlink = proc_ino->i_nlink; + set_nlink(inode, proc_ino->i_nlink); inode->i_size = proc_ino->i_size; inode->i_blocks = proc_ino->i_blocks; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7aafeb8fa300..0be5a78598d0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -94,7 +94,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_HUGETLB | VM_RESERVED; vma->vm_ops = &hugetlb_vm_ops; - if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT)) + if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; vma_len = (loff_t)(vma->vm_end - vma->vm_start); @@ -491,6 +491,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode->i_op = &page_symlink_inode_operations; break; } + lockdep_annotate_inode_mutex_key(inode); } return inode; } @@ -969,7 +970,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, d_instantiate(path.dentry, inode); inode->i_size = size; - inode->i_nlink = 0; + clear_nlink(inode); error = -ENFILE; file = alloc_file(&path, FMODE_WRITE | FMODE_READ, @@ -1030,6 +1031,7 @@ static int __init init_hugetlbfs_fs(void) static void __exit exit_hugetlbfs_fs(void) { kmem_cache_destroy(hugetlbfs_inode_cachep); + kern_unmount(hugetlbfs_vfsmount); unregister_filesystem(&hugetlbfs_fs_type); bdi_destroy(&hugetlbfs_backing_dev_info); } diff --git a/fs/inode.c b/fs/inode.c index 96c77b81167c..ee4e66b998f4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -37,7 +37,7 @@ * inode->i_sb->s_inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list - * inode_wb_list_lock protects: + * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash @@ -48,7 +48,7 @@ * inode->i_lock * inode->i_sb->s_inode_lru_lock * - * inode_wb_list_lock + * bdi->wb.list_lock * inode->i_lock * * inode_hash_lock @@ -65,7 +65,6 @@ static struct hlist_head *inode_hashtable __read_mostly; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); /* * Empty aops. Can be used for the cases where the user does not @@ -143,7 +142,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &empty_fops; - inode->i_nlink = 1; + inode->__i_nlink = 1; + inode->i_opflags = 0; inode->i_uid = 0; inode->i_gid = 0; atomic_set(&inode->i_writecount, 0); @@ -362,9 +362,11 @@ EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { - spin_lock(&inode_sb_list_lock); - list_del_init(&inode->i_sb_list); - spin_unlock(&inode_sb_list_lock); + if (!list_empty(&inode->i_sb_list)) { + spin_lock(&inode_sb_list_lock); + list_del_init(&inode->i_sb_list); + spin_unlock(&inode_sb_list_lock); + } } static unsigned long hash(struct super_block *sb, unsigned long hashval) @@ -398,12 +400,12 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) EXPORT_SYMBOL(__insert_inode_hash); /** - * remove_inode_hash - remove an inode from the hash + * __remove_inode_hash - remove an inode from the hash * @inode: inode to unhash * * Remove an inode from the superblock. */ -void remove_inode_hash(struct inode *inode) +void __remove_inode_hash(struct inode *inode) { spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); @@ -411,7 +413,7 @@ void remove_inode_hash(struct inode *inode) spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } -EXPORT_SYMBOL(remove_inode_hash); +EXPORT_SYMBOL(__remove_inode_hash); void end_writeback(struct inode *inode) { @@ -453,7 +455,9 @@ static void evict(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - inode_wb_list_del(inode); + if (!list_empty(&inode->i_wb_list)) + inode_wb_list_del(inode); + inode_sb_list_del(inode); if (op->evict_inode) { @@ -630,7 +634,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) * inode to the back of the list so we don't spin on it. */ if (!spin_trylock(&inode->i_lock)) { - list_move(&inode->i_lru, &sb->s_inode_lru); + list_move_tail(&inode->i_lru, &sb->s_inode_lru); continue; } @@ -797,6 +801,29 @@ unsigned int get_next_ino(void) EXPORT_SYMBOL(get_next_ino); /** + * new_inode_pseudo - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. + * Inode wont be chained in superblock s_inodes list + * This means : + * - fs can't be unmount + * - quotas, fsnotify, writeback can't work + */ +struct inode *new_inode_pseudo(struct super_block *sb) +{ + struct inode *inode = alloc_inode(sb); + + if (inode) { + spin_lock(&inode->i_lock); + inode->i_state = 0; + spin_unlock(&inode->i_lock); + INIT_LIST_HEAD(&inode->i_sb_list); + } + return inode; +} + +/** * new_inode - obtain an inode * @sb: superblock * @@ -814,27 +841,16 @@ struct inode *new_inode(struct super_block *sb) spin_lock_prefetch(&inode_sb_list_lock); - inode = alloc_inode(sb); - if (inode) { - spin_lock(&inode->i_lock); - inode->i_state = 0; - spin_unlock(&inode->i_lock); + inode = new_inode_pseudo(sb); + if (inode) inode_sb_list_add(inode); - } return inode; } EXPORT_SYMBOL(new_inode); -/** - * unlock_new_inode - clear the I_NEW state and wake up any waiters - * @inode: new inode to unlock - * - * Called when the inode is fully initialised to clear the new state of the - * inode and wake up anyone waiting for the inode to finish initialisation. - */ -void unlock_new_inode(struct inode *inode) -{ #ifdef CONFIG_DEBUG_LOCK_ALLOC +void lockdep_annotate_inode_mutex_key(struct inode *inode) +{ if (S_ISDIR(inode->i_mode)) { struct file_system_type *type = inode->i_sb->s_type; @@ -850,7 +866,20 @@ void unlock_new_inode(struct inode *inode) &type->i_mutex_dir_key); } } +} +EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); #endif + +/** + * unlock_new_inode - clear the I_NEW state and wake up any waiters + * @inode: new inode to unlock + * + * Called when the inode is fully initialised to clear the new state of the + * inode and wake up anyone waiting for the inode to finish initialisation. + */ +void unlock_new_inode(struct inode *inode) +{ + lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; @@ -1308,7 +1337,8 @@ static void iput_final(struct inode *inode) } inode->i_state |= I_FREEING; - inode_lru_list_del(inode); + if (!list_empty(&inode->i_lru)) + inode_lru_list_del(inode); spin_unlock(&inode->i_lock); evict(inode); diff --git a/fs/ioprio.c b/fs/ioprio.c index 7da2a06508e5..f79dab83e17b 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -21,6 +21,7 @@ */ #include <linux/gfp.h> #include <linux/kernel.h> +#include <linux/export.h> #include <linux/ioprio.h> #include <linux/blkdev.h> #include <linux/capability.h> diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index a5d03672d04e..f950059525fc 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -20,6 +20,7 @@ #include <linux/statfs.h> #include <linux/cdrom.h> #include <linux/parser.h> +#include <linux/mpage.h> #include "isofs.h" #include "zisofs.h" @@ -1148,7 +1149,13 @@ struct buffer_head *isofs_bread(struct inode *inode, sector_t block) static int isofs_readpage(struct file *file, struct page *page) { - return block_read_full_page(page,isofs_get_block); + return mpage_readpage(page, isofs_get_block); +} + +static int isofs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) @@ -1158,6 +1165,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) static const struct address_space_operations isofs_aops = { .readpage = isofs_readpage, + .readpages = isofs_readpages, .bmap = _isofs_bmap }; @@ -1319,7 +1327,7 @@ static int isofs_read_inode(struct inode *inode) inode->i_mode = S_IFDIR | sbi->s_dmode; else inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - inode->i_nlink = 1; /* + set_nlink(inode, 1); /* * Set to 1. We know there are 2, but * the find utility tries to optimize * if it is 2, and it screws up. It is @@ -1337,7 +1345,7 @@ static int isofs_read_inode(struct inode *inode) */ inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO; } - inode->i_nlink = 1; + set_nlink(inode, 1); } inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 1fbc7de88f50..70e79d0c756a 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -363,7 +363,7 @@ repeat: break; case SIG('P', 'X'): inode->i_mode = isonum_733(rr->u.PX.mode); - inode->i_nlink = isonum_733(rr->u.PX.n_links); + set_nlink(inode, isonum_733(rr->u.PX.n_links)); inode->i_uid = isonum_733(rr->u.PX.uid); inode->i_gid = isonum_733(rr->u.PX.gid); break; @@ -496,7 +496,7 @@ repeat: goto out; } inode->i_mode = reloc->i_mode; - inode->i_nlink = reloc->i_nlink; + set_nlink(inode, reloc->i_nlink); inode->i_uid = reloc->i_uid; inode->i_gid = reloc->i_gid; inode->i_rdev = reloc->i_rdev; diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index e4b87bc1fa56..f94fc48ff3a0 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -22,6 +22,8 @@ #include <linux/jbd.h> #include <linux/errno.h> #include <linux/slab.h> +#include <linux/blkdev.h> +#include <trace/events/jbd.h> /* * Unlink a buffer from a transaction checkpoint list. @@ -95,10 +97,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + /* + * Get our reference so that bh cannot be freed before + * we unlock it + */ + get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); } else { @@ -220,8 +226,8 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } + get_bh(bh); if (buffer_locked(bh)) { - get_bh(bh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -240,7 +246,6 @@ restart: */ released = __journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); __brelse(bh); } @@ -253,9 +258,12 @@ static void __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; + struct blk_plug plug; + blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(bhs[i], WRITE); + write_dirty_buffer(bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; @@ -304,12 +312,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, ret = 1; if (unlikely(buffer_write_io_error(bh))) ret = -EIO; + get_bh(bh); J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); __brelse(bh); } else { /* @@ -358,6 +366,7 @@ int log_do_checkpoint(journal_t *journal) * journal straight away. */ result = cleanup_journal_tail(journal); + trace_jbd_checkpoint(journal, result); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; @@ -503,6 +512,7 @@ int cleanup_journal_tail(journal_t *journal) if (blocknr < journal->j_tail) freed = freed + journal->j_last - journal->j_first; + trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed); jbd_debug(1, "Cleaning journal tail from %d to %d (offset %u), " "freeing %u\n", @@ -523,9 +533,9 @@ int cleanup_journal_tail(journal_t *journal) /* * journal_clean_one_cp_list * - * Find all the written-back checkpoint buffers in the given list and release them. + * Find all the written-back checkpoint buffers in the given list and release + * them. * - * Called with the journal locked. * Called with j_list_lock held. * Returns number of bufers reaped (for debug) */ @@ -632,8 +642,8 @@ out: * checkpoint lists. * * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. * - * This function is called with the journal locked. * This function is called with j_list_lock held. * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ @@ -652,13 +662,14 @@ int __journal_remove_checkpoint(struct journal_head *jh) } journal = transaction->t_journal; + JBUFFER_TRACE(jh, "removing from transaction"); __buffer_unlink(jh); jh->b_cp_transaction = NULL; + journal_put_journal_head(jh); if (transaction->t_checkpoint_list != NULL || transaction->t_checkpoint_io_list != NULL) goto out; - JBUFFER_TRACE(jh, "transaction has no more buffers"); /* * There is one special case to worry about: if we have just pulled the @@ -669,10 +680,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) * The locking here around t_state is a bit sleazy. * See the comment at the end of journal_commit_transaction(). */ - if (transaction->t_state != T_FINISHED) { - JBUFFER_TRACE(jh, "belongs to running/committing transaction"); + if (transaction->t_state != T_FINISHED) goto out; - } /* OK, that was the last buffer for the transaction: we can now safely remove this transaction from the log */ @@ -684,7 +693,6 @@ int __journal_remove_checkpoint(struct journal_head *jh) wake_up(&journal->j_wait_logspace); ret = 1; out: - JBUFFER_TRACE(jh, "exit"); return ret; } @@ -703,6 +711,8 @@ void __journal_insert_checkpoint(struct journal_head *jh, J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + /* Get reference for checkpointing transaction */ + journal_grab_journal_head(jh2bh(jh)); jh->b_cp_transaction = transaction; if (!transaction->t_checkpoint_list) { @@ -752,6 +762,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); + trace_jbd_drop_transaction(journal, transaction); jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); kfree(transaction); } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 72ffa974b0b8..8799207df058 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -21,6 +21,7 @@ #include <linux/pagemap.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <trace/events/jbd.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -204,6 +205,8 @@ write_out_data: if (!trylock_buffer(bh)) { BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); /* Write out all data to prevent deadlocks */ journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; @@ -236,6 +239,8 @@ write_out_data: jbd_unlock_bh_state(bh); if (bufs == journal->j_wbufsize) { spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; goto write_out_data; @@ -253,10 +258,6 @@ write_out_data: jbd_unlock_bh_state(bh); if (locked) unlock_buffer(bh); - journal_remove_journal_head(bh); - /* One for our safety reference, other for - * journal_remove_journal_head() */ - put_bh(bh); release_data_buffer(bh); } @@ -266,6 +267,7 @@ write_out_data: } } spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, commit_transaction); journal_do_submit_data(wbuf, bufs, write_op); return err; @@ -316,12 +318,14 @@ void journal_commit_transaction(journal_t *journal) commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); + trace_jbd_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + trace_jbd_commit_locking(journal, commit_transaction); spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -392,6 +396,7 @@ void journal_commit_transaction(journal_t *journal) */ journal_switch_revoke_table(journal); + trace_jbd_commit_flushing(journal, commit_transaction); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; @@ -446,14 +451,9 @@ void journal_commit_transaction(journal_t *journal) } if (buffer_jbd(bh) && bh2jh(bh) == jh && jh->b_transaction == commit_transaction && - jh->b_jlist == BJ_Locked) { + jh->b_jlist == BJ_Locked) __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } + jbd_unlock_bh_state(bh); release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } @@ -493,6 +493,7 @@ void journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_COMMIT; spin_unlock(&journal->j_state_lock); + trace_jbd_commit_logging(journal, commit_transaction); J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); @@ -797,10 +798,16 @@ restart_loop: while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; + int try_to_free = 0; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || jh->b_transaction == journal->j_running_transaction); @@ -858,28 +865,27 @@ restart_loop: __journal_insert_checkpoint(jh, commit_transaction); if (is_journal_aborted(journal)) clear_buffer_jbddirty(bh); - JBUFFER_TRACE(jh, "refile for checkpoint writeback"); - __journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); - /* The buffer on BJ_Forget list and not jbddirty means + /* + * The buffer on BJ_Forget list and not jbddirty means * it has been freed by this transaction and hence it * could not have been reallocated until this * transaction has committed. *BUT* it could be * reallocated once we have written all the data to * disk and before we process the buffer on BJ_Forget - * list. */ - JBUFFER_TRACE(jh, "refile or unfile freed buffer"); - __journal_refile_buffer(jh); - if (!jh->b_transaction) { - jbd_unlock_bh_state(bh); - /* needs a brelse */ - journal_remove_journal_head(bh); - release_buffer_page(bh); - } else - jbd_unlock_bh_state(bh); + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; } + JBUFFER_TRACE(jh, "refile or unfile freed buffer"); + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); + else + __brelse(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); @@ -946,6 +952,7 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + trace_jbd_end_commit(journal, commit_transaction); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index e2d4285fbe90..fea8dd661d2b 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -38,6 +38,9 @@ #include <linux/debugfs.h> #include <linux/ratelimit.h> +#define CREATE_TRACE_POINTS +#include <trace/events/jbd.h> + #include <asm/uaccess.h> #include <asm/page.h> @@ -1065,6 +1068,7 @@ void journal_update_superblock(journal_t *journal, int wait) } else write_dirty_buffer(bh, WRITE); + trace_jbd_update_superblock_end(journal, wait); out: /* If we have just flushed the log (by marking s_start==0), then * any future commit will have to be careful to update the @@ -1131,6 +1135,14 @@ static int journal_get_superblock(journal_t *journal) goto out; } + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + return 0; out: @@ -1799,10 +1811,9 @@ static void journal_free_journal_head(struct journal_head *jh) * When a buffer has its BH_JBD bit set it is immune from being released by * core kernel code, mainly via ->b_count. * - * A journal_head may be detached from its buffer_head when the journal_head's - * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. - * Various places in JBD call journal_remove_journal_head() to indicate that the - * journal_head can be dropped if needed. + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. * * Various places in the kernel want to attach a journal_head to a buffer_head * _before_ attaching the journal_head to a transaction. To protect the @@ -1815,17 +1826,16 @@ static void journal_free_journal_head(struct journal_head *jh) * (Attach a journal_head if needed. Increments b_jcount) * struct journal_head *jh = journal_add_journal_head(bh); * ... - * jh->b_transaction = xxx; - * journal_put_journal_head(jh); - * - * Now, the journal_head's b_jcount is zero, but it is safe from being released - * because it has a non-zero b_transaction. + * (Get another reference for transaction) + * journal_grab_journal_head(bh); + * jh->b_transaction = xxx; + * (Put original reference) + * journal_put_journal_head(jh); */ /* * Give a buffer_head a journal_head. * - * Doesn't need the journal lock. * May sleep. */ struct journal_head *journal_add_journal_head(struct buffer_head *bh) @@ -1889,61 +1899,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh) struct journal_head *jh = bh2jh(bh); J_ASSERT_JH(jh, jh->b_jcount >= 0); - - get_bh(bh); - if (jh->b_jcount == 0) { - if (jh->b_transaction == NULL && - jh->b_next_transaction == NULL && - jh->b_cp_transaction == NULL) { - J_ASSERT_JH(jh, jh->b_jlist == BJ_None); - J_ASSERT_BH(bh, buffer_jbd(bh)); - J_ASSERT_BH(bh, jh2bh(jh) == bh); - BUFFER_TRACE(bh, "remove journal_head"); - if (jh->b_frozen_data) { - printk(KERN_WARNING "%s: freeing " - "b_frozen_data\n", - __func__); - jbd_free(jh->b_frozen_data, bh->b_size); - } - if (jh->b_committed_data) { - printk(KERN_WARNING "%s: freeing " - "b_committed_data\n", - __func__); - jbd_free(jh->b_committed_data, bh->b_size); - } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); - __brelse(bh); - journal_free_journal_head(jh); - } else { - BUFFER_TRACE(bh, "journal_head was locked"); - } + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd_free(jh->b_frozen_data, bh->b_size); } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); } /* - * journal_remove_journal_head(): if the buffer isn't attached to a transaction - * and has a zero b_jcount then remove and release its journal_head. If we did - * see that the buffer is not used by any transaction we also "logically" - * decrement ->b_count. - * - * We in fact take an additional increment on ->b_count as a convenience, - * because the caller usually wants to do additional things with the bh - * after calling here. - * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some - * time. Once the caller has run __brelse(), the buffer is eligible for - * reaping by try_to_free_buffers(). - */ -void journal_remove_journal_head(struct buffer_head *bh) -{ - jbd_lock_bh_journal_head(bh); - __journal_remove_journal_head(bh); - jbd_unlock_bh_journal_head(bh); -} - -/* - * Drop a reference on the passed journal_head. If it fell to zero then try to + * Drop a reference on the passed journal_head. If it fell to zero then * release the journal_head from the buffer_head. */ void journal_put_journal_head(struct journal_head *jh) @@ -1953,11 +1931,12 @@ void journal_put_journal_head(struct journal_head *jh) jbd_lock_bh_journal_head(bh); J_ASSERT_JH(jh, jh->b_jcount > 0); --jh->b_jcount; - if (!jh->b_jcount && !jh->b_transaction) { + if (!jh->b_jcount) { __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); __brelse(bh); - } - jbd_unlock_bh_journal_head(bh); + } else + jbd_unlock_bh_journal_head(bh); } /* diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index f7ee81a065da..7e59c6e66f9b 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -26,6 +26,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hrtimer.h> +#include <linux/backing-dev.h> static void __journal_temp_unlink_buffer(struct journal_head *jh); @@ -99,11 +100,10 @@ static int start_this_handle(journal_t *journal, handle_t *handle) alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); + new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS); if (!new_transaction) { - ret = -ENOMEM; - goto out; + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto alloc_transaction; } } @@ -696,7 +696,6 @@ repeat: if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); - jh->b_transaction = transaction; JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __journal_file_buffer(jh, transaction, BJ_Reserved); @@ -818,7 +817,6 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) * committed and so it's safe to clear the dirty bit. */ clear_buffer_dirty(jh2bh(jh)); - jh->b_transaction = transaction; /* first access by this transaction */ jh->b_modified = 0; @@ -844,8 +842,8 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) */ JBUFFER_TRACE(jh, "cancelling revoke"); journal_cancel_revoke(handle, jh); - journal_put_journal_head(jh); out: + journal_put_journal_head(jh); return err; } @@ -1069,8 +1067,9 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) ret = -EIO; goto no_journal; } - - if (jh->b_transaction != NULL) { + /* We might have slept so buffer could be refiled now */ + if (jh->b_transaction != NULL && + jh->b_transaction != handle->h_transaction) { JBUFFER_TRACE(jh, "unfile from commit"); __journal_temp_unlink_buffer(jh); /* It still points to the committing @@ -1091,8 +1090,6 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { JBUFFER_TRACE(jh, "not on correct data list: unfile"); J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; JBUFFER_TRACE(jh, "file as data"); __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); @@ -1300,8 +1297,6 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) __journal_file_buffer(jh, transaction, BJ_Forget); } else { __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1622,19 +1617,32 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh) mark_buffer_dirty(bh); /* Expose it to the VM */ } +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. + */ void __journal_unfile_buffer(struct journal_head *jh) { __journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; + journal_put_journal_head(jh); } void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) { - jbd_lock_bh_state(jh2bh(jh)); + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); __journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + jbd_unlock_bh_state(bh); + __brelse(bh); } /* @@ -1661,16 +1669,12 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) /* A written-back ordered data buffer */ JBUFFER_TRACE(jh, "release data"); __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); __journal_remove_checkpoint(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } spin_unlock(&journal->j_list_lock); @@ -1733,7 +1737,7 @@ int journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * journal_remove_journal_head() and journal_put_journal_head(). + * journal_put_journal_head(). */ jh = journal_grab_journal_head(bh); if (!jh) @@ -1770,10 +1774,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) int may_free = 1; struct buffer_head *bh = jh2bh(jh); - __journal_unfile_buffer(jh); - if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + __journal_temp_unlink_buffer(jh); /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in @@ -1784,8 +1787,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); - journal_remove_journal_head(bh); - __brelse(bh); + __journal_unfile_buffer(jh); } return may_free; } @@ -2070,6 +2072,8 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction) __journal_temp_unlink_buffer(jh); + else + journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { @@ -2127,9 +2131,10 @@ void journal_file_buffer(struct journal_head *jh, * already started to be used by a subsequent transaction, refile the * buffer on that transaction's metadata list. * - * Called under journal->j_list_lock - * + * Called under j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns */ void __journal_refile_buffer(struct journal_head *jh) { @@ -2153,6 +2158,11 @@ void __journal_refile_buffer(struct journal_head *jh) was_dirty = test_clear_buffer_jbddirty(bh); __journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __journal_file_buffer() must not take a + * new one. + */ jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; if (buffer_freed(bh)) @@ -2169,30 +2179,21 @@ void __journal_refile_buffer(struct journal_head *jh) } /* - * For the unlocked version of this call, also make sure that any - * hanging journal_head is cleaned up if necessary. - * - * __journal_refile_buffer is usually called as part of a single locked - * operation on a buffer_head, in which the caller is probably going to - * be hooking the journal_head onto other lists. In that case it is up - * to the caller to remove the journal_head if necessary. For the - * unlocked journal_refile_buffer call, the caller isn't going to be - * doing anything else to the buffer so we need to do the cleanup - * ourselves to avoid a jh leak. - * - * *** The journal_head may be freed by this call! *** + * __journal_refile_buffer() with necessary locking added. We take our bh + * reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. */ void journal_refile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); - __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - spin_unlock(&journal->j_list_lock); __brelse(bh); } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 2c62c5aae82f..16a698bd906d 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -257,9 +257,12 @@ static void __flush_batch(journal_t *journal, int *batch_count) { int i; + struct blk_plug plug; + blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE); + write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = journal->j_chkpt_bhs[i]; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index eef6979821a4..68d704db787f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -352,7 +352,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(commit_transaction->t_state == T_RUNNING); trace_jbd2_start_commit(journal, commit_transaction); - jbd_debug(1, "JBD: starting commit of transaction %d\n", + jbd_debug(1, "JBD2: starting commit of transaction %d\n", commit_transaction->t_tid); write_lock(&journal->j_state_lock); @@ -427,7 +427,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) __jbd2_journal_clean_checkpoint_list(journal); spin_unlock(&journal->j_list_lock); - jbd_debug (3, "JBD: commit phase 1\n"); + jbd_debug(3, "JBD2: commit phase 1\n"); /* * Switch to a new revoke table. @@ -447,7 +447,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) wake_up(&journal->j_wait_transaction_locked); write_unlock(&journal->j_state_lock); - jbd_debug (3, "JBD: commit phase 2\n"); + jbd_debug(3, "JBD2: commit phase 2\n"); /* * Now start flushing things to disk, in the order they appear @@ -462,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) WRITE_SYNC); blk_finish_plug(&plug); - jbd_debug(3, "JBD: commit phase 2\n"); + jbd_debug(3, "JBD2: commit phase 2\n"); /* * Way to go: we have now written out all of the data for a @@ -522,7 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT (bufs == 0); - jbd_debug(4, "JBD: get descriptor\n"); + jbd_debug(4, "JBD2: get descriptor\n"); descriptor = jbd2_journal_get_descriptor_buffer(journal); if (!descriptor) { @@ -531,7 +531,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) } bh = jh2bh(descriptor); - jbd_debug(4, "JBD: got buffer %llu (%p)\n", + jbd_debug(4, "JBD2: got buffer %llu (%p)\n", (unsigned long long)bh->b_blocknr, bh->b_data); header = (journal_header_t *)&bh->b_data[0]; header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); @@ -625,7 +625,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_buffers == NULL || space_left < tag_bytes + 16) { - jbd_debug(4, "JBD: Submit %d IOs\n", bufs); + jbd_debug(4, "JBD2: Submit %d IOs\n", bufs); /* Write an end-of-descriptor marker before submitting the IOs. "tag" still points to @@ -707,7 +707,7 @@ start_journal_io: so we incur less scheduling load. */ - jbd_debug(3, "JBD: commit phase 3\n"); + jbd_debug(3, "JBD2: commit phase 3\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. @@ -771,7 +771,7 @@ wait_for_iobuf: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD: commit phase 4\n"); + jbd_debug(3, "JBD2: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: @@ -801,7 +801,7 @@ wait_for_iobuf: if (err) jbd2_journal_abort(journal, err); - jbd_debug(3, "JBD: commit phase 5\n"); + jbd_debug(3, "JBD2: commit phase 5\n"); write_lock(&journal->j_state_lock); J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); commit_transaction->t_state = T_COMMIT_JFLUSH; @@ -830,7 +830,7 @@ wait_for_iobuf: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD: commit phase 6\n"); + jbd_debug(3, "JBD2: commit phase 6\n"); J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); @@ -964,7 +964,7 @@ restart_loop: /* Done with this transaction! */ - jbd_debug(3, "JBD: commit phase 7\n"); + jbd_debug(3, "JBD2: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); @@ -1039,7 +1039,7 @@ restart_loop: journal->j_commit_callback(journal, commit_transaction); trace_jbd2_end_commit(journal, commit_transaction); - jbd_debug(1, "JBD: commit %d complete, head %d\n", + jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); if (to_free) kfree(commit_transaction); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0dfa5b598e68..0fa0123151d3 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -491,7 +491,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) */ journal->j_commit_request = target; - jbd_debug(1, "JBD: requesting commit %d/%d\n", + jbd_debug(1, "JBD2: requesting commit %d/%d\n", journal->j_commit_request, journal->j_commit_sequence); wake_up(&journal->j_wait_commit); @@ -500,7 +500,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) /* This should never happen, but if it does, preserve the evidence before kjournald goes into a loop and increments j_commit_sequence beyond all recognition. */ - WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n", journal->j_commit_request, journal->j_commit_sequence, target, journal->j_running_transaction ? @@ -645,7 +645,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) } #endif while (tid_gt(tid, journal->j_commit_sequence)) { - jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", + jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); wake_up(&journal->j_wait_commit); read_unlock(&journal->j_state_lock); @@ -1093,7 +1093,7 @@ static int journal_reset(journal_t *journal) first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { - printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", + printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n", first, last); journal_fail_superblock(journal); return -EINVAL; @@ -1139,7 +1139,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) */ if (sb->s_start == 0 && journal->j_tail_sequence == journal->j_transaction_sequence) { - jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " "(start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); @@ -1163,7 +1163,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) } read_lock(&journal->j_state_lock); - jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", + jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); @@ -1216,8 +1216,8 @@ static int journal_get_superblock(journal_t *journal) ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - printk (KERN_ERR - "JBD: IO error reading journal superblock\n"); + printk(KERN_ERR + "JBD2: IO error reading journal superblock\n"); goto out; } } @@ -1228,7 +1228,7 @@ static int journal_get_superblock(journal_t *journal) if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { - printk(KERN_WARNING "JBD: no valid journal superblock found\n"); + printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); goto out; } @@ -1240,14 +1240,22 @@ static int journal_get_superblock(journal_t *journal) journal->j_format_version = 2; break; default: - printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); + printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); goto out; } if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) journal->j_maxlen = be32_to_cpu(sb->s_maxlen); else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { - printk (KERN_WARNING "JBD: journal file too short\n"); + printk(KERN_WARNING "JBD2: journal file too short\n"); + goto out; + } + + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD2: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); goto out; } @@ -1310,8 +1318,8 @@ int jbd2_journal_load(journal_t *journal) ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || (sb->s_feature_incompat & ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { - printk (KERN_WARNING - "JBD: Unrecognised features on journal\n"); + printk(KERN_WARNING + "JBD2: Unrecognised features on journal\n"); return -EINVAL; } } @@ -1346,7 +1354,7 @@ int jbd2_journal_load(journal_t *journal) return 0; recovery_error: - printk (KERN_WARNING "JBD: recovery failed\n"); + printk(KERN_WARNING "JBD2: recovery failed\n"); return -EIO; } @@ -1577,7 +1585,7 @@ static int journal_convert_superblock_v1(journal_t *journal, struct buffer_head *bh; printk(KERN_WARNING - "JBD: Converting superblock from version 1 to 2.\n"); + "JBD2: Converting superblock from version 1 to 2.\n"); /* Pre-initialise new fields to zero */ offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); @@ -1694,7 +1702,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (!journal->j_tail) goto no_recovery; - printk (KERN_WARNING "JBD: %s recovery information on journal\n", + printk(KERN_WARNING "JBD2: %s recovery information on journal\n", write ? "Clearing" : "Ignoring"); err = jbd2_journal_skip_recovery(journal); @@ -2020,7 +2028,7 @@ static int journal_init_jbd2_journal_head_cache(void) retval = 0; if (!jbd2_journal_head_cache) { retval = -ENOMEM; - printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); + printk(KERN_EMERG "JBD2: no memory for journal_head cache\n"); } return retval; } @@ -2383,80 +2391,13 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD2_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); + printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n); #endif jbd2_remove_debugfs_entry(); jbd2_remove_jbd_stats_proc_entry(); jbd2_journal_destroy_caches(); } -/* - * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 - * tracing infrastructure to map a dev_t to a device name. - * - * The caller should use rcu_read_lock() in order to make sure the - * device name stays valid until its done with it. We use - * rcu_read_lock() as well to make sure we're safe in case the caller - * gets sloppy, and because rcu_read_lock() is cheap and can be safely - * nested. - */ -struct devname_cache { - struct rcu_head rcu; - dev_t device; - char devname[BDEVNAME_SIZE]; -}; -#define CACHE_SIZE_BITS 6 -static struct devname_cache *devcache[1 << CACHE_SIZE_BITS]; -static DEFINE_SPINLOCK(devname_cache_lock); - -static void free_devcache(struct rcu_head *rcu) -{ - kfree(rcu); -} - -const char *jbd2_dev_to_name(dev_t device) -{ - int i = hash_32(device, CACHE_SIZE_BITS); - char *ret; - struct block_device *bd; - static struct devname_cache *new_dev; - - rcu_read_lock(); - if (devcache[i] && devcache[i]->device == device) { - ret = devcache[i]->devname; - rcu_read_unlock(); - return ret; - } - rcu_read_unlock(); - - new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); - if (!new_dev) - return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ - bd = bdget(device); - spin_lock(&devname_cache_lock); - if (devcache[i]) { - if (devcache[i]->device == device) { - kfree(new_dev); - bdput(bd); - ret = devcache[i]->devname; - spin_unlock(&devname_cache_lock); - return ret; - } - call_rcu(&devcache[i]->rcu, free_devcache); - } - devcache[i] = new_dev; - devcache[i]->device = device; - if (bd) { - bdevname(bd, devcache[i]->devname); - bdput(bd); - } else - __bdevname(device, devcache[i]->devname); - ret = devcache[i]->devname; - spin_unlock(&devname_cache_lock); - return ret; -} -EXPORT_SYMBOL(jbd2_dev_to_name); - MODULE_LICENSE("GPL"); module_init(journal_init); module_exit(journal_exit); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 1cad869494f0..da6d7baf1390 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -89,7 +89,7 @@ static int do_readahead(journal_t *journal, unsigned int start) err = jbd2_journal_bmap(journal, next, &blocknr); if (err) { - printk (KERN_ERR "JBD: bad block at offset %u\n", + printk(KERN_ERR "JBD2: bad block at offset %u\n", next); goto failed; } @@ -138,14 +138,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal, *bhp = NULL; if (offset >= journal->j_maxlen) { - printk(KERN_ERR "JBD: corrupted journal superblock\n"); + printk(KERN_ERR "JBD2: corrupted journal superblock\n"); return -EIO; } err = jbd2_journal_bmap(journal, offset, &blocknr); if (err) { - printk (KERN_ERR "JBD: bad block at offset %u\n", + printk(KERN_ERR "JBD2: bad block at offset %u\n", offset); return err; } @@ -163,7 +163,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, } if (!buffer_uptodate(bh)) { - printk (KERN_ERR "JBD: Failed to read block at offset %u\n", + printk(KERN_ERR "JBD2: Failed to read block at offset %u\n", offset); brelse(bh); return -EIO; @@ -251,10 +251,10 @@ int jbd2_journal_recover(journal_t *journal) if (!err) err = do_one_pass(journal, &info, PASS_REPLAY); - jbd_debug(1, "JBD: recovery, exit status %d, " + jbd_debug(1, "JBD2: recovery, exit status %d, " "recovered transactions %u to %u\n", err, info.start_transaction, info.end_transaction); - jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", + jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n", info.nr_replays, info.nr_revoke_hits, info.nr_revokes); /* Restart the log at the next transaction ID, thus invalidating @@ -293,14 +293,14 @@ int jbd2_journal_skip_recovery(journal_t *journal) err = do_one_pass(journal, &info, PASS_SCAN); if (err) { - printk(KERN_ERR "JBD: error %d scanning journal\n", err); + printk(KERN_ERR "JBD2: error %d scanning journal\n", err); ++journal->j_transaction_sequence; } else { #ifdef CONFIG_JBD2_DEBUG int dropped = info.end_transaction - be32_to_cpu(journal->j_superblock->s_sequence); jbd_debug(1, - "JBD: ignoring %d transaction%s from the journal.\n", + "JBD2: ignoring %d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); #endif journal->j_transaction_sequence = ++info.end_transaction; @@ -338,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh, wrap(journal, *next_log_block); err = jread(&obh, journal, io_block); if (err) { - printk(KERN_ERR "JBD: IO error %d recovering block " + printk(KERN_ERR "JBD2: IO error %d recovering block " "%lu in log\n", err, io_block); return 1; } else { @@ -411,7 +411,7 @@ static int do_one_pass(journal_t *journal, * either the next descriptor block or the final commit * record. */ - jbd_debug(3, "JBD: checking block %ld\n", next_log_block); + jbd_debug(3, "JBD2: checking block %ld\n", next_log_block); err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -491,8 +491,8 @@ static int do_one_pass(journal_t *journal, /* Recover what we can, but * report failure at the end. */ success = err; - printk (KERN_ERR - "JBD: IO error %d recovering " + printk(KERN_ERR + "JBD2: IO error %d recovering " "block %ld in log\n", err, io_block); } else { @@ -520,7 +520,7 @@ static int do_one_pass(journal_t *journal, journal->j_blocksize); if (nbh == NULL) { printk(KERN_ERR - "JBD: Out of memory " + "JBD2: Out of memory " "during recovery.\n"); err = -ENOMEM; brelse(bh); @@ -689,7 +689,7 @@ static int do_one_pass(journal_t *journal, /* It's really bad news if different passes end up at * different places (but possible due to IO errors). */ if (info->end_transaction != next_commit_ID) { - printk (KERN_ERR "JBD: recovery pass %d ended at " + printk(KERN_ERR "JBD2: recovery pass %d ended at " "transaction %u, expected %u\n", pass, next_commit_ID, info->end_transaction); if (!success) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 2d7109414cdd..a0e41a4c080e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -27,6 +27,7 @@ #include <linux/highmem.h> #include <linux/hrtimer.h> #include <linux/backing-dev.h> +#include <linux/bug.h> #include <linux/module.h> static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); @@ -115,7 +116,7 @@ static inline void update_t_max_wait(transaction_t *transaction, */ static int start_this_handle(journal_t *journal, handle_t *handle, - int gfp_mask) + gfp_t gfp_mask) { transaction_t *transaction, *new_transaction = NULL; tid_t tid; @@ -124,7 +125,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle, unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { - printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", + printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", current->comm, nblocks, journal->j_max_transaction_buffers); return -ENOSPC; @@ -320,7 +321,7 @@ static handle_t *new_handle(int nblocks) * Return a pointer to a newly allocated handle, or an ERR_PTR() value * on failure. */ -handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask) { handle_t *handle = journal_current_handle(); int err; @@ -443,7 +444,7 @@ out: * transaction capabable of guaranteeing the requested number of * credits. */ -int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) +int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; @@ -563,7 +564,7 @@ static void warn_dirty_buffer(struct buffer_head *bh) char b[BDEVNAME_SIZE]; printk(KERN_WARNING - "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " "There's a risk of filesystem corruption in case of system " "crash.\n", bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); @@ -1049,6 +1050,10 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, * mark dirty metadata which needs to be journaled as part of the current * transaction. * + * The buffer must have previously had jbd2_journal_get_write_access() + * called so that it has a valid journal_head attached to the buffer + * head. + * * The buffer is placed on the transaction's metadata list and is marked * as belonging to the transaction. * @@ -1065,11 +1070,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; struct journal_head *jh = bh2jh(bh); + int ret = 0; jbd_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); if (is_handle_aborted(handle)) goto out; + if (!buffer_jbd(bh)) { + ret = -EUCLEAN; + goto out; + } jbd_lock_bh_state(bh); @@ -1093,8 +1103,20 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { JBUFFER_TRACE(jh, "fastpath"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_running_transaction); + if (unlikely(jh->b_transaction != + journal->j_running_transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_transaction (%llu, %p, %u) != " + "journal->j_running_transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_transaction, + jh->b_transaction ? jh->b_transaction->t_tid : 0, + journal->j_running_transaction, + journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); + ret = -EINVAL; + } goto out_unlock_bh; } @@ -1108,9 +1130,32 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + if (unlikely(jh->b_transaction != + journal->j_committing_transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_transaction (%llu, %p, %u) != " + "journal->j_committing_transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_transaction, + jh->b_transaction ? jh->b_transaction->t_tid : 0, + journal->j_committing_transaction, + journal->j_committing_transaction ? + journal->j_committing_transaction->t_tid : 0); + ret = -EINVAL; + } + if (unlikely(jh->b_next_transaction != transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_next_transaction (%llu, %p, %u) != " + "transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_next_transaction, + jh->b_next_transaction ? + jh->b_next_transaction->t_tid : 0, + transaction, transaction->t_tid); + ret = -EINVAL; + } /* And this case is illegal: we can't reuse another * transaction's data buffer, ever. */ goto out_unlock_bh; @@ -1127,7 +1172,8 @@ out_unlock_bh: jbd_unlock_bh_state(bh); out: JBUFFER_TRACE(jh, "exit"); - return 0; + WARN_ON(ret); /* All errors are bugs, so dump the stack */ + return ret; } /* diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 3675b3cdee89..926d02068a14 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -156,7 +156,7 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size) return ERR_PTR(-EINVAL); } -static struct posix_acl *jffs2_get_acl(struct inode *inode, int type) +struct posix_acl *jffs2_get_acl(struct inode *inode, int type) { struct posix_acl *acl; char *value = NULL; @@ -227,7 +227,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; rc = posix_acl_equiv_mode(acl, &mode); if (rc < 0) return rc; @@ -259,30 +259,11 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return rc; } -int jffs2_check_acl(struct inode *inode, int mask) +int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode) { struct posix_acl *acl; int rc; - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - rc = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return rc; - } - return -EAGAIN; -} - -int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) -{ - struct posix_acl *acl, *clone; - int rc; - cache_no_acl(inode); if (S_ISLNK(*i_mode)) @@ -298,18 +279,13 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) if (S_ISDIR(*i_mode)) set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - clone = posix_acl_clone(acl, GFP_KERNEL); - if (!clone) - return -ENOMEM; - rc = posix_acl_create_masq(clone, (mode_t *)i_mode); - if (rc < 0) { - posix_acl_release(clone); + rc = posix_acl_create(&acl, GFP_KERNEL, i_mode); + if (rc < 0) return rc; - } if (rc > 0) - set_cached_acl(inode, ACL_TYPE_ACCESS, clone); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(clone); + posix_acl_release(acl); } return 0; } @@ -335,7 +311,7 @@ int jffs2_init_acl_post(struct inode *inode) int jffs2_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; int rc; if (S_ISLNK(inode->i_mode)) @@ -343,14 +319,11 @@ int jffs2_acl_chmod(struct inode *inode) acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); + rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (rc) + return rc; + rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); - if (!clone) - return -ENOMEM; - rc = posix_acl_chmod_masq(clone, inode->i_mode); - if (!rc) - rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, clone); - posix_acl_release(clone); return rc; } diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 5e42de8d9541..9b477246f2a6 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -26,9 +26,9 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL -extern int jffs2_check_acl(struct inode *, int); +struct posix_acl *jffs2_get_acl(struct inode *inode, int type); extern int jffs2_acl_chmod(struct inode *); -extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); +extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); extern const struct xattr_handler jffs2_acl_access_xattr_handler; @@ -36,7 +36,7 @@ extern const struct xattr_handler jffs2_acl_default_xattr_handler; #else -#define jffs2_check_acl (NULL) +#define jffs2_get_acl (NULL) #define jffs2_acl_chmod(inode) (0) #define jffs2_init_acl_pre(dir_i,inode,mode) (0) #define jffs2_init_acl_post(inode) (0) diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index de4247021d25..5b6c9d1a2fb9 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -53,6 +53,78 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this, return 0; } +/* + * jffs2_selected_compress: + * @compr: Explicit compression type to use (ie, JFFS2_COMPR_ZLIB). + * If 0, just take the first available compression mode. + * @data_in: Pointer to uncompressed data + * @cpage_out: Pointer to returned pointer to buffer for compressed data + * @datalen: On entry, holds the amount of data available for compression. + * On exit, expected to hold the amount of data actually compressed. + * @cdatalen: On entry, holds the amount of space available for compressed + * data. On exit, expected to hold the actual size of the compressed + * data. + * + * Returns: the compression type used. Zero is used to show that the data + * could not be compressed; probably because we couldn't find the requested + * compression mode. + */ +static int jffs2_selected_compress(u8 compr, unsigned char *data_in, + unsigned char **cpage_out, u32 *datalen, u32 *cdatalen) +{ + struct jffs2_compressor *this; + int err, ret = JFFS2_COMPR_NONE; + uint32_t orig_slen, orig_dlen; + char *output_buf; + + output_buf = kmalloc(*cdatalen, GFP_KERNEL); + if (!output_buf) { + printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n"); + return ret; + } + orig_slen = *datalen; + orig_dlen = *cdatalen; + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + /* Skip decompress-only and disabled modules */ + if (!this->compress || this->disabled) + continue; + + /* Skip if not the desired compression type */ + if (compr && (compr != this->compr)) + continue; + + /* + * Either compression type was unspecified, or we found our + * compressor; either way, we're good to go. + */ + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + + *datalen = orig_slen; + *cdatalen = orig_dlen; + err = this->compress(data_in, output_buf, datalen, cdatalen); + + spin_lock(&jffs2_compressor_list_lock); + this->usecount--; + if (!err) { + /* Success */ + ret = this->compr; + this->stat_compr_blocks++; + this->stat_compr_orig_size += *datalen; + this->stat_compr_new_size += *cdatalen; + break; + } + } + spin_unlock(&jffs2_compressor_list_lock); + if (ret == JFFS2_COMPR_NONE) + kfree(output_buf); + else + *cpage_out = output_buf; + + return ret; +} + /* jffs2_compress: * @data_in: Pointer to uncompressed data * @cpage_out: Pointer to returned pointer to buffer for compressed data @@ -76,47 +148,23 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint32_t *datalen, uint32_t *cdatalen) { int ret = JFFS2_COMPR_NONE; - int compr_ret; + int mode, compr_ret; struct jffs2_compressor *this, *best=NULL; unsigned char *output_buf = NULL, *tmp_buf; uint32_t orig_slen, orig_dlen; uint32_t best_slen=0, best_dlen=0; - switch (jffs2_compression_mode) { + if (c->mount_opts.override_compr) + mode = c->mount_opts.compr; + else + mode = jffs2_compression_mode; + + switch (mode) { case JFFS2_COMPR_MODE_NONE: break; case JFFS2_COMPR_MODE_PRIORITY: - output_buf = kmalloc(*cdatalen,GFP_KERNEL); - if (!output_buf) { - printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n"); - goto out; - } - orig_slen = *datalen; - orig_dlen = *cdatalen; - spin_lock(&jffs2_compressor_list_lock); - list_for_each_entry(this, &jffs2_compressor_list, list) { - /* Skip decompress-only backwards-compatibility and disabled modules */ - if ((!this->compress)||(this->disabled)) - continue; - - this->usecount++; - spin_unlock(&jffs2_compressor_list_lock); - *datalen = orig_slen; - *cdatalen = orig_dlen; - compr_ret = this->compress(data_in, output_buf, datalen, cdatalen); - spin_lock(&jffs2_compressor_list_lock); - this->usecount--; - if (!compr_ret) { - ret = this->compr; - this->stat_compr_blocks++; - this->stat_compr_orig_size += *datalen; - this->stat_compr_new_size += *cdatalen; - break; - } - } - spin_unlock(&jffs2_compressor_list_lock); - if (ret == JFFS2_COMPR_NONE) - kfree(output_buf); + ret = jffs2_selected_compress(0, data_in, cpage_out, datalen, + cdatalen); break; case JFFS2_COMPR_MODE_SIZE: case JFFS2_COMPR_MODE_FAVOURLZO: @@ -174,22 +222,28 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, best->stat_compr_orig_size += best_slen; best->stat_compr_new_size += best_dlen; ret = best->compr; + *cpage_out = output_buf; } spin_unlock(&jffs2_compressor_list_lock); break; + case JFFS2_COMPR_MODE_FORCELZO: + ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in, + cpage_out, datalen, cdatalen); + break; + case JFFS2_COMPR_MODE_FORCEZLIB: + ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in, + cpage_out, datalen, cdatalen); + break; default: printk(KERN_ERR "JFFS2: unknown compression mode.\n"); } - out: + if (ret == JFFS2_COMPR_NONE) { *cpage_out = data_in; *datalen = *cdatalen; none_stat_compr_blocks++; none_stat_compr_size += *datalen; } - else { - *cpage_out = output_buf; - } return ret; } diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h index 13bb7597ab39..5e91d578f4ed 100644 --- a/fs/jffs2/compr.h +++ b/fs/jffs2/compr.h @@ -40,6 +40,8 @@ #define JFFS2_COMPR_MODE_PRIORITY 1 #define JFFS2_COMPR_MODE_SIZE 2 #define JFFS2_COMPR_MODE_FAVOURLZO 3 +#define JFFS2_COMPR_MODE_FORCELZO 4 +#define JFFS2_COMPR_MODE_FORCEZLIB 5 #define FAVOUR_LZO_PERCENT 80 diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 5f243cd63afc..be6169bd8acd 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -56,7 +56,7 @@ const struct inode_operations jffs2_dir_inode_operations = .rmdir = jffs2_rmdir, .mknod = jffs2_mknod, .rename = jffs2_rename, - .check_acl = jffs2_check_acl, + .get_acl = jffs2_get_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, @@ -245,7 +245,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, dead_f, now); if (dead_f->inocache) - dentry->d_inode->i_nlink = dead_f->inocache->pino_nlink; + set_nlink(dentry->d_inode, dead_f->inocache->pino_nlink); if (!ret) dir_i->i_mtime = dir_i->i_ctime = ITIME(now); return ret; @@ -278,7 +278,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de if (!ret) { mutex_lock(&f->sem); - old_dentry->d_inode->i_nlink = ++f->inocache->pino_nlink; + set_nlink(old_dentry->d_inode, ++f->inocache->pino_nlink); mutex_unlock(&f->sem); d_instantiate(dentry, old_dentry->d_inode); dir_i->i_mtime = dir_i->i_ctime = ITIME(now); @@ -497,7 +497,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) f = JFFS2_INODE_INFO(inode); /* Directories get nlink 2 at start */ - inode->i_nlink = 2; + set_nlink(inode, 2); /* but ic->pino_nlink is the parent ino# */ f->inocache->pino_nlink = dir_i->i_ino; diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 3989f7e09f7f..61e6723535b9 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -63,7 +63,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { - .check_acl = jffs2_check_acl, + .get_acl = jffs2_get_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 46ad619b6124..4b8afe39a87f 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -80,7 +80,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); if (ret) { jffs2_free_raw_inode(ri); - if (S_ISLNK(inode->i_mode & S_IFMT)) + if (S_ISLNK(inode->i_mode)) kfree(mdata); return ret; } @@ -278,7 +278,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); - inode->i_nlink = f->inocache->pino_nlink; + set_nlink(inode, f->inocache->pino_nlink); inode->i_blocks = (inode->i_size + 511) >> 9; @@ -291,7 +291,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) case S_IFDIR: { struct jffs2_full_dirent *fd; - inode->i_nlink = 2; /* parent and '.' */ + set_nlink(inode, 2); /* parent and '.' */ for (fd=f->dents; fd; fd = fd->next) { if (fd->type == DT_DIR && fd->ino) @@ -379,7 +379,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags) jffs2_do_setattr(inode, &iattr); } -int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) +int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); @@ -406,7 +406,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, fill in the raw_inode while you're at it. */ -struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri) +struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri) { struct inode *inode; struct super_block *sb = dir_i->i_sb; @@ -453,7 +453,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i iput(inode); return ERR_PTR(ret); } - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_ino = je32_to_cpu(ri->ino); inode->i_mode = jemode_to_cpu(ri->mode); inode->i_gid = je16_to_cpu(ri->gid); diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 0bc6a6c80a56..55a0c1dceadf 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -29,6 +29,11 @@ struct jffs2_inodirty; +struct jffs2_mount_opts { + bool override_compr; + unsigned int compr; +}; + /* A struct for the overall file system control. Pointers to jffs2_sb_info structs are named `c' in the source code. Nee jffs_control @@ -126,6 +131,7 @@ struct jffs2_sb_info { #endif struct jffs2_summary *summary; /* Summary information */ + struct jffs2_mount_opts mount_opts; #ifdef CONFIG_JFFS2_FS_XATTR #define XATTRINDEX_HASHSIZE (57) diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 9c252835e8e5..ab65ee3ec858 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -173,10 +173,10 @@ int jffs2_do_setattr (struct inode *, struct iattr *); struct inode *jffs2_iget(struct super_block *, unsigned long); void jffs2_evict_inode (struct inode *); void jffs2_dirty_inode(struct inode *inode, int flags); -struct inode *jffs2_new_inode (struct inode *dir_i, int mode, +struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri); int jffs2_statfs (struct dentry *, struct kstatfs *); -int jffs2_remount_fs (struct super_block *, int *, char *); +int jffs2_do_remount_fs(struct super_block *, int *, char *); int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); void jffs2_gc_release_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f); diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 2ab1a0d91210..ee57bac1ba6d 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1041,7 +1041,7 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf /* FIXME: point() */ err = jffs2_flash_read(c, ref_offset(ref), len, &retlen, buf); if (err) { - JFFS2_ERROR("can not read %d bytes from 0x%08x, " "error code: %d.\n", len, ref_offset(ref), err); + JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ref_offset(ref), err); goto free_out; } diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 8d8cd3419d02..28107ca136e4 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -275,9 +275,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) else c->mtd->unpoint(c->mtd, 0, c->mtd->size); #endif - if (s) - kfree(s); - + kfree(s); return ret; } diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index cfeb7164b085..0f20208df602 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -22,26 +22,29 @@ #include <linux/security.h> #include "nodelist.h" -/* ---- Initial Security Label Attachment -------------- */ -int jffs2_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr) +/* ---- Initial Security Label(s) Attachment callback --- */ +int jffs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int rc; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + int err = 0; - rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (rc) { - if (rc == -EOPNOTSUPP) - return 0; - return rc; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0); + return err; +} - kfree(name); - kfree(value); - return rc; +/* ---- Initial Security Label(s) Attachment ----------- */ +int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &jffs2_initxattrs, NULL); } /* ---- XATTR Handler for "security.*" ----------------- */ diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 853b8e300084..e7e974454115 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -17,11 +17,13 @@ #include <linux/fs.h> #include <linux/err.h> #include <linux/mount.h> +#include <linux/parser.h> #include <linux/jffs2.h> #include <linux/pagemap.h> #include <linux/mtd/super.h> #include <linux/ctype.h> #include <linux/namei.h> +#include <linux/seq_file.h> #include <linux/exportfs.h> #include "compr.h" #include "nodelist.h" @@ -75,6 +77,37 @@ static void jffs2_write_super(struct super_block *sb) unlock_super(sb); } +static const char *jffs2_compr_name(unsigned int compr) +{ + switch (compr) { + case JFFS2_COMPR_MODE_NONE: + return "none"; +#ifdef CONFIG_JFFS2_LZO + case JFFS2_COMPR_MODE_FORCELZO: + return "lzo"; +#endif +#ifdef CONFIG_JFFS2_ZLIB + case JFFS2_COMPR_MODE_FORCEZLIB: + return "zlib"; +#endif + default: + /* should never happen; programmer error */ + WARN_ON(1); + return ""; + } +} + +static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb); + struct jffs2_mount_opts *opts = &c->mount_opts; + + if (opts->override_compr) + seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr)); + + return 0; +} + static int jffs2_sync_fs(struct super_block *sb, int wait) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); @@ -133,6 +166,85 @@ static const struct export_operations jffs2_export_ops = { .fh_to_parent = jffs2_fh_to_parent, }; +/* + * JFFS2 mount options. + * + * Opt_override_compr: override default compressor + * Opt_err: just end of array marker + */ +enum { + Opt_override_compr, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_override_compr, "compr=%s"}, + {Opt_err, NULL}, +}; + +static int jffs2_parse_options(struct jffs2_sb_info *c, char *data) +{ + substring_t args[MAX_OPT_ARGS]; + char *p, *name; + + if (!data) + return 0; + + while ((p = strsep(&data, ","))) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_override_compr: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strcmp(name, "none")) + c->mount_opts.compr = JFFS2_COMPR_MODE_NONE; +#ifdef CONFIG_JFFS2_LZO + else if (!strcmp(name, "lzo")) + c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO; +#endif +#ifdef CONFIG_JFFS2_ZLIB + else if (!strcmp(name, "zlib")) + c->mount_opts.compr = + JFFS2_COMPR_MODE_FORCEZLIB; +#endif + else { + printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"", + name); + kfree(name); + return -EINVAL; + } + kfree(name); + c->mount_opts.override_compr = true; + break; + default: + printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n", + p); + return -EINVAL; + } + } + + return 0; +} + +static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + int err; + + err = jffs2_parse_options(c, data); + if (err) + return -EINVAL; + + return jffs2_do_remount_fs(sb, flags, data); +} + static const struct super_operations jffs2_super_operations = { .alloc_inode = jffs2_alloc_inode, @@ -143,6 +255,7 @@ static const struct super_operations jffs2_super_operations = .remount_fs = jffs2_remount_fs, .evict_inode = jffs2_evict_inode, .dirty_inode = jffs2_dirty_inode, + .show_options = jffs2_show_options, .sync_fs = jffs2_sync_fs, }; @@ -166,6 +279,12 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) c->os_priv = sb; sb->s_fs_info = c; + ret = jffs2_parse_options(c, data); + if (ret) { + kfree(c); + return -EINVAL; + } + /* Initialize JFFS2 superblock locks, the further initialization will * be done later */ mutex_init(&c->alloc_sem); diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index b955626071c2..e3035afb1814 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -20,7 +20,7 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, - .check_acl = jffs2_check_acl, + .get_acl = jffs2_get_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 4515bea0268f..b09e51d2f81f 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -578,8 +578,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) if (!jffs2_is_writebuffered(c)) return 0; - if (mutex_trylock(&c->alloc_sem)) { - mutex_unlock(&c->alloc_sem); + if (!mutex_is_locked(&c->alloc_sem)) { printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); BUG(); } @@ -1026,7 +1025,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); struct mtd_oob_ops ops; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; ops.oobbuf = c->oobbuf; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; @@ -1069,7 +1068,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct mtd_oob_ops ops; int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = cmlen; ops.oobbuf = c->oobbuf; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; @@ -1095,7 +1094,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct mtd_oob_ops ops; int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = cmlen; ops.oobbuf = (uint8_t *)&oob_cleanmarker; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 8a0a0666d5a6..45559dc3ea2f 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -27,7 +27,7 @@ #include "jfs_xattr.h" #include "jfs_acl.h" -static struct posix_acl *jfs_get_acl(struct inode *inode, int type) +struct posix_acl *jfs_get_acl(struct inode *inode, int type) { struct posix_acl *acl; char *ea_name; @@ -114,30 +114,9 @@ out: return rc; } -int jfs_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; -} - int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; - struct posix_acl *clone; - mode_t mode; int rc = 0; if (S_ISLNK(inode->i_mode)) @@ -153,20 +132,11 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) if (rc) goto cleanup; } - clone = posix_acl_clone(acl, GFP_KERNEL); - if (!clone) { - rc = -ENOMEM; - goto cleanup; - } - mode = inode->i_mode; - rc = posix_acl_create_masq(clone, &mode); - if (rc >= 0) { - inode->i_mode = mode; - if (rc > 0) - rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, - clone); - } - posix_acl_release(clone); + rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); + if (rc < 0) + goto cleanup; /* posix_acl_release(NULL) is no-op */ + if (rc > 0) + rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); cleanup: posix_acl_release(acl); } else @@ -180,8 +150,9 @@ cleanup: int jfs_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; int rc; + tid_t tid; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; @@ -190,22 +161,18 @@ int jfs_acl_chmod(struct inode *inode) if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - - rc = posix_acl_chmod_masq(clone, inode->i_mode); - if (!rc) { - tid_t tid = txBegin(inode->i_sb, 0); - mutex_lock(&JFS_IP(inode)->commit_mutex); - rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, clone); - if (!rc) - rc = txCommit(tid, 1, &inode, 0); - txEnd(tid); - mutex_unlock(&JFS_IP(inode)->commit_mutex); - } + rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (rc) + return rc; - posix_acl_release(clone); + tid = txBegin(inode->i_sb, 0); + mutex_lock(&JFS_IP(inode)->commit_mutex); + rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + + posix_acl_release(acl); return rc; } diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 7527855b5cc6..844f9460cb11 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -140,7 +140,7 @@ const struct inode_operations jfs_file_inode_operations = { .removexattr = jfs_removexattr, .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL - .check_acl = jfs_check_acl, + .get_acl = jfs_get_acl, #endif }; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 54e07559878d..ad84fe50ca9e 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_check_acl(struct inode *, int); +struct posix_acl *jfs_get_acl(struct inode *inode, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_acl_chmod(struct inode *inode); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 4496872cf4e7..9cbd11a3f804 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -3161,7 +3161,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, { int rc; int dbitno, word, rembits, nb, nwords, wbitno, agno; - s8 oldroot, *leaf; + s8 oldroot; struct dmaptree *tp = (struct dmaptree *) & dp->tree; /* save the current value of the root (i.e. maximum free string) @@ -3169,9 +3169,6 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, */ oldroot = tp->stree[ROOT]; - /* pick up a pointer to the leaves of the dmap tree */ - leaf = tp->stree + LEAFIND; - /* determine the bit number and word within the dmap of the * starting block. */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index b78b2f978f04..1b6f15f191b3 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -457,7 +457,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) /* read the page of fixed disk inode (AIT) in raw mode */ mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); if (mp == NULL) { - ip->i_nlink = 1; /* Don't want iput() deleting it */ + set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); return (NULL); } @@ -469,7 +469,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) /* copy on-disk inode to in-memory inode */ if ((copy_from_dinode(dp, ip)) != 0) { /* handle bad return by returning NULL for ip */ - ip->i_nlink = 1; /* Don't want iput() deleting it */ + set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); /* release the page */ release_metapage(mp); @@ -3076,7 +3076,7 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) ip->i_mode |= 0001; } } - ip->i_nlink = le32_to_cpu(dip->di_nlink); + set_nlink(ip, le32_to_cpu(dip->di_nlink)); jfs_ip->saved_uid = le32_to_cpu(dip->di_uid); if (sbi->uid == -1) diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 2686531e235a..c1a3e603279c 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -157,7 +157,7 @@ fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; fail_unlock: - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); fail_put: iput(inode); diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 583636f745e5..cc5f811ed383 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -67,6 +67,7 @@ #include <linux/buffer_head.h> /* for sync_blockdev() */ #include <linux/bio.h> #include <linux/freezer.h> +#include <linux/export.h> #include <linux/delay.h> #include <linux/mutex.h> #include <linux/seq_file.h> diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index f6cc0c09ec63..af9606057dde 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1143,7 +1143,6 @@ int txCommit(tid_t tid, /* transaction identifier */ struct jfs_log *log; struct tblock *tblk; struct lrd *lrd; - int lsn; struct inode *ip; struct jfs_inode_info *jfs_ip; int k, n; @@ -1310,7 +1309,7 @@ int txCommit(tid_t tid, /* transaction identifier */ */ lrd->type = cpu_to_le16(LOG_COMMIT); lrd->length = 0; - lsn = lmLog(log, tblk, lrd, NULL); + lmLog(log, tblk, lrd, NULL); lmGroupCommit(log, tblk); @@ -2935,7 +2934,6 @@ int jfs_sync(void *arg) { struct inode *ip; struct jfs_inode_info *jfs_ip; - int rc; tid_t tid; do { @@ -2961,7 +2959,7 @@ int jfs_sync(void *arg) */ TXN_UNLOCK(); tid = txBegin(ip->i_sb, COMMIT_INODE); - rc = txCommit(tid, 1, &ip, 0); + txCommit(tid, 1, &ip, 0); txEnd(tid); mutex_unlock(&jfs_ip->commit_mutex); diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index adcf92d3b603..7971f37534a3 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb) /* * Wait for outstanding transactions to be written to log: */ - jfs_flush_journal(log, 1); + jfs_flush_journal(log, 2); /* * close fileset inode allocation map (aka fileset inode) @@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb) * * remove file system from log active file system list. */ - jfs_flush_journal(log, 1); + jfs_flush_journal(log, 2); /* * Make sure all metadata makes it to disk diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 03787ef6a118..a112ad96e474 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -172,7 +172,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { @@ -292,7 +292,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) goto out3; } - ip->i_nlink = 2; /* for '.' */ + set_nlink(ip, 2); /* for '.' */ ip->i_op = &jfs_dir_inode_operations; ip->i_fop = &jfs_dir_operations; @@ -311,7 +311,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { @@ -844,7 +844,7 @@ static int jfs_link(struct dentry *old_dentry, rc = txCommit(tid, 2, &iplist[0], 0); if (rc) { - ip->i_nlink--; /* never instantiated */ + drop_nlink(ip); /* never instantiated */ iput(ip); } else d_instantiate(dentry, ip); @@ -893,7 +893,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, unchar *i_fastsymlink; s64 xlen = 0; int bmask = 0, xsize; - s64 extent = 0, xaddr; + s64 xaddr; struct metapage *mp; struct super_block *sb; struct tblock *tblk; @@ -993,7 +993,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, txAbort(tid, 0); goto out3; } - extent = xaddr; ip->i_size = ssize - 1; while (ssize) { /* This is kind of silly since PATH_MAX == 4K */ @@ -1049,7 +1048,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { @@ -1434,7 +1433,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, mutex_unlock(&JFS_IP(dir)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { @@ -1537,7 +1536,7 @@ const struct inode_operations jfs_dir_inode_operations = { .removexattr = jfs_removexattr, .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL - .check_acl = jfs_check_acl, + .get_acl = jfs_get_acl, #endif }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 06c8a67cbe76..a44eff076c17 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -485,7 +485,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) goto out_unload; } inode->i_ino = 0; - inode->i_nlink = 1; inode->i_size = sb->s_bdev->bd_inode->i_size; inode->i_mapping->a_ops = &jfs_metapage_aops; insert_inode_hash(inode); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 24838f1eeee5..26683e15b3ac 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -693,8 +693,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, return rc; } if (acl) { - mode_t mode = inode->i_mode; - rc = posix_acl_equiv_mode(acl, &mode); + rc = posix_acl_equiv_mode(acl, &inode->i_mode); posix_acl_release(acl); if (rc < 0) { printk(KERN_ERR @@ -702,7 +701,6 @@ static int can_set_system_xattr(struct inode *inode, const char *name, rc); return rc; } - inode->i_mode = mode; mark_inode_dirty(inode); } /* @@ -1091,38 +1089,37 @@ int jfs_removexattr(struct dentry *dentry, const char *name) } #ifdef CONFIG_JFS_SECURITY -int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int rc; - size_t len; - void *value; - char *suffix; + const struct xattr *xattr; + tid_t *tid = fs_info; char *name; - - rc = security_inode_init_security(inode, dir, qstr, &suffix, &value, - &len); - if (rc) { - if (rc == -EOPNOTSUPP) - return 0; - return rc; - } - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix), - GFP_NOFS); - if (!name) { - rc = -ENOMEM; - goto kmalloc_failed; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + + err = __jfs_setxattr(*tid, inode, name, + xattr->value, xattr->value_len, 0); + kfree(name); + if (err < 0) + break; } - strcpy(name, XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); - - rc = __jfs_setxattr(tid, inode, name, value, len, 0); - - kfree(name); -kmalloc_failed: - kfree(suffix); - kfree(value); + return err; +} - return rc; +int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &jfs_initxattrs, &tid); } #endif diff --git a/fs/libfs.c b/fs/libfs.c index c18e9a1235b6..f6d411eef1e7 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -490,7 +490,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); root = d_alloc_root(inode); if (!root) { iput(inode); @@ -510,8 +510,10 @@ int simple_fill_super(struct super_block *s, unsigned long magic, if (!dentry) goto out; inode = new_inode(s); - if (!inode) + if (!inode) { + dput(dentry); goto out; + } inode->i_mode = S_IFREG | files->mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_fop = files->ops; diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index e374050a911c..8392cb85bd54 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -302,7 +302,8 @@ nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc) /* We appear to be out of the grace period */ wake_up_all(&host->h_gracewait); } - dprintk("lockd: server returns status %d\n", resp->status); + dprintk("lockd: server returns status %d\n", + ntohl(resp->status)); return 0; /* Okay, call complete */ } @@ -690,7 +691,8 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) goto out; if (resp->status != nlm_lck_denied_nolocks) - printk("lockd: unexpected unlock status: %d\n", resp->status); + printk("lockd: unexpected unlock status: %d\n", + ntohl(resp->status)); /* What to do now? I'm out of my depth... */ status = -ENOLCK; out: @@ -843,6 +845,7 @@ nlm_stat_to_errno(__be32 status) return -ENOLCK; #endif } - printk(KERN_NOTICE "lockd: unexpected server status %d\n", status); + printk(KERN_NOTICE "lockd: unexpected server status %d\n", + ntohl(status)); return -ENOLCK; } diff --git a/fs/lockd/host.c b/fs/lockd/host.c index b7c99bfb3da6..6f29836ec0cb 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -316,14 +316,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, struct hlist_node *pos; struct nlm_host *host = NULL; struct nsm_handle *nsm = NULL; - struct sockaddr_in sin = { - .sin_family = AF_INET, - }; - struct sockaddr_in6 sin6 = { - .sin6_family = AF_INET6, - }; - struct sockaddr *src_sap; - size_t src_len = rqstp->rq_addrlen; + struct sockaddr *src_sap = svc_daddr(rqstp); + size_t src_len = rqstp->rq_daddrlen; struct nlm_lookup_host_info ni = { .server = 1, .sap = svc_addr(rqstp), @@ -340,21 +334,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, mutex_lock(&nlm_host_mutex); - switch (ni.sap->sa_family) { - case AF_INET: - sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; - src_sap = (struct sockaddr *)&sin; - break; - case AF_INET6: - ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); - src_sap = (struct sockaddr *)&sin6; - break; - default: - dprintk("lockd: %s failed; unrecognized address family\n", - __func__); - goto out; - } - if (time_after_eq(jiffies, next_gc)) nlm_gc_hosts(); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index abfff9d7979d..c061b9aa7ddb 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -282,7 +282,7 @@ int lockd_up(void) /* * Create the kernel thread and wait for it to start. */ - nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); + nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); if (IS_ERR(nlmsvc_rqst)) { error = PTR_ERR(nlmsvc_rqst); nlmsvc_rqst = NULL; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 6e31695d046f..f0179c3745d2 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -632,7 +632,7 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) /* * This is a callback from the filesystem for VFS file lock requests. - * It will be used if fl_grant is defined and the filesystem can not + * It will be used if lm_grant is defined and the filesystem can not * respond to the request immediately. * For GETLK request it will copy the reply to the nlm_block. * For SETLK or SETLKW request it will get the local posix lock. @@ -719,9 +719,9 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2) } const struct lock_manager_operations nlmsvc_lock_operations = { - .fl_compare_owner = nlmsvc_same_owner, - .fl_notify = nlmsvc_notify_blocked, - .fl_grant = nlmsvc_grant_deferred, + .lm_compare_owner = nlmsvc_same_owner, + .lm_notify = nlmsvc_notify_blocked, + .lm_grant = nlmsvc_grant_deferred, }; /* diff --git a/fs/locks.c b/fs/locks.c index b286539d547a..3b0d05dcd7c1 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -60,7 +60,7 @@ * * Initial implementation of mandatory locks. SunOS turned out to be * a rotten model, so I implemented the "obvious" semantics. - * See 'Documentation/mandatory.txt' for details. + * See 'Documentation/filesystems/mandatory-locking.txt' for details. * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. * * Don't allow mandatory locks on mmap()'ed files. Added simple functions to @@ -133,6 +133,20 @@ #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) +static bool lease_breaking(struct file_lock *fl) +{ + return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); +} + +static int target_leasetype(struct file_lock *fl) +{ + if (fl->fl_flags & FL_UNLOCK_PENDING) + return F_UNLCK; + if (fl->fl_flags & FL_DOWNGRADE_PENDING) + return F_RDLCK; + return fl->fl_type; +} + int leases_enable = 1; int lease_break_time = 45; @@ -160,26 +174,20 @@ EXPORT_SYMBOL_GPL(unlock_flocks); static struct kmem_cache *filelock_cache __read_mostly; -static void locks_init_lock_always(struct file_lock *fl) +static void locks_init_lock_heads(struct file_lock *fl) { - fl->fl_next = NULL; - fl->fl_fasync = NULL; - fl->fl_owner = NULL; - fl->fl_pid = 0; - fl->fl_nspid = NULL; - fl->fl_file = NULL; - fl->fl_flags = 0; - fl->fl_type = 0; - fl->fl_start = fl->fl_end = 0; + INIT_LIST_HEAD(&fl->fl_link); + INIT_LIST_HEAD(&fl->fl_block); + init_waitqueue_head(&fl->fl_wait); } /* Allocate an empty lock structure. */ struct file_lock *locks_alloc_lock(void) { - struct file_lock *fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL); + struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL); if (fl) - locks_init_lock_always(fl); + locks_init_lock_heads(fl); return fl; } @@ -193,8 +201,8 @@ void locks_release_private(struct file_lock *fl) fl->fl_ops = NULL; } if (fl->fl_lmops) { - if (fl->fl_lmops->fl_release_private) - fl->fl_lmops->fl_release_private(fl); + if (fl->fl_lmops->lm_release_private) + fl->fl_lmops->lm_release_private(fl); fl->fl_lmops = NULL; } @@ -215,27 +223,12 @@ EXPORT_SYMBOL(locks_free_lock); void locks_init_lock(struct file_lock *fl) { - INIT_LIST_HEAD(&fl->fl_link); - INIT_LIST_HEAD(&fl->fl_block); - init_waitqueue_head(&fl->fl_wait); - fl->fl_ops = NULL; - fl->fl_lmops = NULL; - locks_init_lock_always(fl); + memset(fl, 0, sizeof(struct file_lock)); + locks_init_lock_heads(fl); } EXPORT_SYMBOL(locks_init_lock); -/* - * Initialises the fields of the file lock which are invariant for - * free file_locks. - */ -static void init_once(void *foo) -{ - struct file_lock *lock = (struct file_lock *) foo; - - locks_init_lock(lock); -} - static void locks_copy_private(struct file_lock *new, struct file_lock *fl) { if (fl->fl_ops) { @@ -444,9 +437,9 @@ static void lease_release_private_callback(struct file_lock *fl) } static const struct lock_manager_operations lease_manager_ops = { - .fl_break = lease_break_callback, - .fl_release_private = lease_release_private_callback, - .fl_change = lease_modify, + .lm_break = lease_break_callback, + .lm_release_private = lease_release_private_callback, + .lm_change = lease_modify, }; /* @@ -499,9 +492,9 @@ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2) */ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) { - if (fl1->fl_lmops && fl1->fl_lmops->fl_compare_owner) + if (fl1->fl_lmops && fl1->fl_lmops->lm_compare_owner) return fl2->fl_lmops == fl1->fl_lmops && - fl1->fl_lmops->fl_compare_owner(fl1, fl2); + fl1->fl_lmops->lm_compare_owner(fl1, fl2); return fl1->fl_owner == fl2->fl_owner; } @@ -551,8 +544,8 @@ static void locks_wake_up_blocks(struct file_lock *blocker) waiter = list_first_entry(&blocker->fl_block, struct file_lock, fl_block); __locks_delete_block(waiter); - if (waiter->fl_lmops && waiter->fl_lmops->fl_notify) - waiter->fl_lmops->fl_notify(waiter); + if (waiter->fl_lmops && waiter->fl_lmops->lm_notify) + waiter->fl_lmops->lm_notify(waiter); else wake_up(&waiter->fl_wait); } @@ -1140,6 +1133,17 @@ int locks_mandatory_area(int read_write, struct inode *inode, EXPORT_SYMBOL(locks_mandatory_area); +static void lease_clear_pending(struct file_lock *fl, int arg) +{ + switch (arg) { + case F_UNLCK: + fl->fl_flags &= ~FL_UNLOCK_PENDING; + /* fall through: */ + case F_RDLCK: + fl->fl_flags &= ~FL_DOWNGRADE_PENDING; + } +} + /* We already had a lease on this file; just change its type */ int lease_modify(struct file_lock **before, int arg) { @@ -1148,6 +1152,7 @@ int lease_modify(struct file_lock **before, int arg) if (error) return error; + lease_clear_pending(fl, arg); locks_wake_up_blocks(fl); if (arg == F_UNLCK) locks_delete_lock(before); @@ -1156,19 +1161,25 @@ int lease_modify(struct file_lock **before, int arg) EXPORT_SYMBOL(lease_modify); +static bool past_time(unsigned long then) +{ + if (!then) + /* 0 is a special value meaning "this never expires": */ + return false; + return time_after(jiffies, then); +} + static void time_out_leases(struct inode *inode) { struct file_lock **before; struct file_lock *fl; before = &inode->i_flock; - while ((fl = *before) && IS_LEASE(fl) && (fl->fl_type & F_INPROGRESS)) { - if ((fl->fl_break_time == 0) - || time_before(jiffies, fl->fl_break_time)) { - before = &fl->fl_next; - continue; - } - lease_modify(before, fl->fl_type & ~F_INPROGRESS); + while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { + if (past_time(fl->fl_downgrade_time)) + lease_modify(before, F_RDLCK); + if (past_time(fl->fl_break_time)) + lease_modify(before, F_UNLCK); if (fl == *before) /* lease_modify may have freed fl */ before = &fl->fl_next; } @@ -1186,7 +1197,7 @@ static void time_out_leases(struct inode *inode) */ int __break_lease(struct inode *inode, unsigned int mode) { - int error = 0, future; + int error = 0; struct file_lock *new_fl, *flock; struct file_lock *fl; unsigned long break_time; @@ -1203,24 +1214,13 @@ int __break_lease(struct inode *inode, unsigned int mode) if ((flock == NULL) || !IS_LEASE(flock)) goto out; + if (!locks_conflict(flock, new_fl)) + goto out; + for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) if (fl->fl_owner == current->files) i_have_this_lease = 1; - if (want_write) { - /* If we want write access, we have to revoke any lease. */ - future = F_UNLCK | F_INPROGRESS; - } else if (flock->fl_type & F_INPROGRESS) { - /* If the lease is already being broken, we just leave it */ - future = flock->fl_type; - } else if (flock->fl_type & F_WRLCK) { - /* Downgrade the exclusive lease to a read-only lease. */ - future = F_RDLCK | F_INPROGRESS; - } else { - /* the existing lease was read-only, so we can read too. */ - goto out; - } - if (IS_ERR(new_fl) && !i_have_this_lease && ((mode & O_NONBLOCK) == 0)) { error = PTR_ERR(new_fl); @@ -1235,12 +1235,18 @@ int __break_lease(struct inode *inode, unsigned int mode) } for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { - if (fl->fl_type != future) { - fl->fl_type = future; + if (want_write) { + if (fl->fl_flags & FL_UNLOCK_PENDING) + continue; + fl->fl_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; - /* lease must have lmops break callback */ - fl->fl_lmops->fl_break(fl); + } else { + if (lease_breaking(flock)) + continue; + fl->fl_flags |= FL_DOWNGRADE_PENDING; + fl->fl_downgrade_time = break_time; } + fl->fl_lmops->lm_break(fl); } if (i_have_this_lease || (mode & O_NONBLOCK)) { @@ -1264,10 +1270,13 @@ restart: if (error >= 0) { if (error == 0) time_out_leases(inode); - /* Wait for the next lease that has not been broken yet */ + /* + * Wait for the next conflicting lease that has not been + * broken yet + */ for (flock = inode->i_flock; flock && IS_LEASE(flock); flock = flock->fl_next) { - if (flock->fl_type & F_INPROGRESS) + if (locks_conflict(new_fl, flock)) goto restart; } error = 0; @@ -1335,7 +1344,7 @@ int fcntl_getlease(struct file *filp) for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { if (fl->fl_file == filp) { - type = fl->fl_type & ~F_INPROGRESS; + type = target_leasetype(fl); break; } } @@ -1343,50 +1352,23 @@ int fcntl_getlease(struct file *filp) return type; } -/** - * generic_setlease - sets a lease on an open file - * @filp: file pointer - * @arg: type of lease to obtain - * @flp: input - file_lock to use, output - file_lock inserted - * - * The (input) flp->fl_lmops->fl_break function is required - * by break_lease(). - * - * Called with file_lock_lock held. - */ -int generic_setlease(struct file *filp, long arg, struct file_lock **flp) +int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; - int error, rdlease_count = 0, wrlease_count = 0; + int error; lease = *flp; - error = -EACCES; - if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) - goto out; - error = -EINVAL; - if (!S_ISREG(inode->i_mode)) + error = -EAGAIN; + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) goto out; - error = security_file_lock(filp, arg); - if (error) + if ((arg == F_WRLCK) + && ((dentry->d_count > 1) + || (atomic_read(&inode->i_count) > 1))) goto out; - time_out_leases(inode); - - BUG_ON(!(*flp)->fl_lmops->fl_break); - - if (arg != F_UNLCK) { - error = -EAGAIN; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) - goto out; - if ((arg == F_WRLCK) - && ((dentry->d_count > 1) - || (atomic_read(&inode->i_count) > 1))) - goto out; - } - /* * At this point, we know that if there is an exclusive * lease on this file, then we hold it on this filp @@ -1395,37 +1377,35 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) * then the file is not open by anyone (including us) * except for this filp. */ + error = -EAGAIN; for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { - if (fl->fl_file == filp) + if (fl->fl_file == filp) { my_before = before; - else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) - /* - * Someone is in the process of opening this - * file for writing so we may not take an - * exclusive lease on it. - */ - wrlease_count++; - else - rdlease_count++; + continue; + } + /* + * No exclusive leases if someone else has a lease on + * this file: + */ + if (arg == F_WRLCK) + goto out; + /* + * Modifying our existing lease is OK, but no getting a + * new lease if someone else is opening for write: + */ + if (fl->fl_flags & FL_UNLOCK_PENDING) + goto out; } - error = -EAGAIN; - if ((arg == F_RDLCK && (wrlease_count > 0)) || - (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0))) - goto out; - if (my_before != NULL) { - error = lease->fl_lmops->fl_change(my_before, arg); + error = lease->fl_lmops->lm_change(my_before, arg); if (!error) *flp = *my_before; goto out; } - if (arg == F_UNLCK) - goto out; - error = -EINVAL; if (!leases_enable) goto out; @@ -1436,6 +1416,62 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) out: return error; } + +int generic_delete_lease(struct file *filp, struct file_lock **flp) +{ + struct file_lock *fl, **before; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + + for (before = &inode->i_flock; + ((fl = *before) != NULL) && IS_LEASE(fl); + before = &fl->fl_next) { + if (fl->fl_file != filp) + continue; + return (*flp)->fl_lmops->lm_change(before, F_UNLCK); + } + return -EAGAIN; +} + +/** + * generic_setlease - sets a lease on an open file + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: input - file_lock to use, output - file_lock inserted + * + * The (input) flp->fl_lmops->lm_break function is required + * by break_lease(). + * + * Called with file_lock_lock held. + */ +int generic_setlease(struct file *filp, long arg, struct file_lock **flp) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + int error; + + if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) + return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + error = security_file_lock(filp, arg); + if (error) + return error; + + time_out_leases(inode); + + BUG_ON(!(*flp)->fl_lmops->lm_break); + + switch (arg) { + case F_UNLCK: + return generic_delete_lease(filp, flp); + case F_RDLCK: + case F_WRLCK: + return generic_add_lease(filp, arg, flp); + default: + BUG(); + } +} EXPORT_SYMBOL(generic_setlease); static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) @@ -1453,7 +1489,7 @@ static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) * @lease: file_lock to use * * Call this to establish a lease on the file. - * The (*lease)->fl_lmops->fl_break operation must be set; if not, + * The (*lease)->fl_lmops->lm_break operation must be set; if not, * break_lease will oops! * * This will call the filesystem's setlease file method, if @@ -1751,10 +1787,10 @@ out: * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX * locks, the ->lock() interface may return asynchronously, before the lock has * been granted or denied by the underlying filesystem, if (and only if) - * fl_grant is set. Callers expecting ->lock() to return asynchronously + * lm_grant is set. Callers expecting ->lock() to return asynchronously * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if) * the request is for a blocking lock. When ->lock() does return asynchronously, - * it must return FILE_LOCK_DEFERRED, and call ->fl_grant() when the lock + * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock * request completes. * If the request is for non-blocking lock the file system should return * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine @@ -1764,7 +1800,7 @@ out: * grants a lock so the VFS can find out which locks are locally held and do * the correct lock cleanup when required. * The underlying filesystem must not drop the kernel lock or call - * ->fl_grant() before returning to the caller with a FILE_LOCK_DEFERRED + * ->lm_grant() before returning to the caller with a FILE_LOCK_DEFERRED * return code. */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) @@ -2147,7 +2183,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } } else if (IS_LEASE(fl)) { seq_printf(f, "LEASE "); - if (fl->fl_type & F_INPROGRESS) + if (lease_breaking(fl)) seq_printf(f, "BREAKING "); else if (fl->fl_file) seq_printf(f, "ACTIVE "); @@ -2163,7 +2199,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); } else { seq_printf(f, "%s ", - (fl->fl_type & F_INPROGRESS) + (lease_breaking(fl)) ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ " : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ "); } @@ -2333,8 +2369,8 @@ EXPORT_SYMBOL(lock_may_write); static int __init filelock_init(void) { filelock_cache = kmem_cache_create("file_lock_cache", - sizeof(struct file_lock), 0, SLAB_PANIC, - init_once); + sizeof(struct file_lock), 0, SLAB_PANIC, NULL); + return 0; } diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index b3ff3d894165..b7d7f67cee5a 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -197,7 +197,7 @@ static int logfs_remove_inode(struct inode *inode) { int ret; - inode->i_nlink--; + drop_nlink(inode); ret = write_inode(inode); LOGFS_BUG_ON(ret, inode->i_sb); return ret; @@ -433,7 +433,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry, ta = kzalloc(sizeof(*ta), GFP_KERNEL); if (!ta) { - inode->i_nlink--; + drop_nlink(inode); iput(inode); return -ENOMEM; } @@ -456,7 +456,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry, abort_transaction(inode, ta); li->li_flags |= LOGFS_IF_STILLBORN; /* FIXME: truncate symlink */ - inode->i_nlink--; + drop_nlink(inode); iput(inode); goto out; } @@ -563,7 +563,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir, inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ihold(inode); - inode->i_nlink++; + inc_nlink(inode); mark_inode_dirty_sync(inode); return __logfs_create(dir, dentry, inode, NULL, 0); diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index edfea7a3a747..7e441ad5f792 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -93,7 +93,7 @@ static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) /* inode->i_nlink == 0 can be true when called from * block validator */ /* set i_nlink to 0 to prevent caching */ - inode->i_nlink = 0; + clear_nlink(inode); logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; iget_failed(inode); if (!err) @@ -199,7 +199,6 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode) inode->i_blocks = 0; inode->i_ctime = CURRENT_TIME; inode->i_mtime = CURRENT_TIME; - inode->i_nlink = 1; li->li_refcount = 1; INIT_LIST_HEAD(&li->li_freeing_list); diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index f22d108bfa5d..398ecff6e548 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -618,7 +618,6 @@ static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs, struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); void emergency_read_end(struct page *page); void logfs_crash_dump(struct super_block *sb); -void *memchr_inv(const void *s, int c, size_t n); int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); int logfs_check_ds(struct logfs_disk_super *ds); int logfs_write_sb(struct super_block *sb); diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index d8d09380c7de..2ac4217b7901 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -126,7 +126,7 @@ static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) inode->i_atime = be64_to_timespec(di->di_atime); inode->i_ctime = be64_to_timespec(di->di_ctime); inode->i_mtime = be64_to_timespec(di->di_mtime); - inode->i_nlink = be32_to_cpu(di->di_refcount); + set_nlink(inode, be32_to_cpu(di->di_refcount)); inode->i_generation = be32_to_cpu(di->di_generation); switch (inode->i_mode & S_IFMT) { diff --git a/fs/logfs/super.c b/fs/logfs/super.c index ce03a182c771..e795c234ea33 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -13,6 +13,7 @@ #include <linux/bio.h> #include <linux/slab.h> #include <linux/blkdev.h> +#include <linux/module.h> #include <linux/mtd/mtd.h> #include <linux/statfs.h> #include <linux/buffer_head.h> @@ -91,28 +92,6 @@ void logfs_crash_dump(struct super_block *sb) } /* - * TODO: move to lib/string.c - */ -/** - * memchr_inv - Find a character in an area of memory. - * @s: The memory area - * @c: The byte to search for - * @n: The size of the area. - * - * returns the address of the first character other than @c, or %NULL - * if the whole buffer contains just @c. - */ -void *memchr_inv(const void *s, int c, size_t n) -{ - const unsigned char *p = s; - while (n-- != 0) - if ((unsigned char)c != *p++) - return (void *)(p - 1); - - return NULL; -} - -/* * FIXME: There should be a reserve for root, similar to ext2. */ int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 3f32bcb0d9bd..ef175cb8cfd8 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -16,38 +16,26 @@ #include <linux/bitops.h> #include <linux/sched.h> -static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 }; - static DEFINE_SPINLOCK(bitmap_lock); -static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) +/* + * bitmap consists of blocks filled with 16bit words + * bit set == busy, bit clear == free + * endianness is a mess, but for counting zero bits it really doesn't matter... + */ +static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits) { - unsigned i, j, sum = 0; - struct buffer_head *bh; - - for (i=0; i<numblocks-1; i++) { - if (!(bh=map[i])) - return(0); - for (j=0; j<bh->b_size; j++) - sum += nibblemap[bh->b_data[j] & 0xf] - + nibblemap[(bh->b_data[j]>>4) & 0xf]; - } + __u32 sum = 0; + unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8); - if (numblocks==0 || !(bh=map[numblocks-1])) - return(0); - i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2; - for (j=0; j<i; j++) { - sum += nibblemap[bh->b_data[j] & 0xf] - + nibblemap[(bh->b_data[j]>>4) & 0xf]; + while (blocks--) { + unsigned words = blocksize / 2; + __u16 *p = (__u16 *)(*map++)->b_data; + while (words--) + sum += 16 - hweight16(*p++); } - i = numbits%16; - if (i!=0) { - i = *(__u16 *)(&bh->b_data[j]) | ~((1<<i) - 1); - sum += nibblemap[i & 0xf] + nibblemap[(i>>4) & 0xf]; - sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf]; - } - return(sum); + return sum; } void minix_free_block(struct inode *inode, unsigned long block) @@ -105,10 +93,12 @@ int minix_new_block(struct inode * inode) return 0; } -unsigned long minix_count_free_blocks(struct minix_sb_info *sbi) +unsigned long minix_count_free_blocks(struct super_block *sb) { - return (count_free(sbi->s_zmap, sbi->s_zmap_blocks, - sbi->s_nzones - sbi->s_firstdatazone + 1) + struct minix_sb_info *sbi = minix_sb(sb); + u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1); + + return (count_free(sbi->s_zmap, sb->s_blocksize, bits) << sbi->s_log_zone_size); } @@ -273,7 +263,10 @@ struct inode *minix_new_inode(const struct inode *dir, int mode, int *error) return inode; } -unsigned long minix_count_free_inodes(struct minix_sb_info *sbi) +unsigned long minix_count_free_inodes(struct super_block *sb) { - return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1); + struct minix_sb_info *sbi = minix_sb(sb); + u32 bits = sbi->s_ninodes + 1; + + return count_free(sbi->s_imap, sb->s_blocksize, bits); } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index e7d23e25bf1d..1d9e33966db0 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -279,6 +279,27 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) else if (sbi->s_mount_state & MINIX_ERROR_FS) printk("MINIX-fs: mounting file system with errors, " "running fsck is recommended\n"); + + /* Apparently minix can create filesystems that allocate more blocks for + * the bitmaps than needed. We simply ignore that, but verify it didn't + * create one with not enough blocks and bail out if so. + */ + block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize); + if (sbi->s_imap_blocks < block) { + printk("MINIX-fs: file system does not have enough " + "imap blocks allocated. Refusing to mount\n"); + goto out_iput; + } + + block = minix_blocks_needed( + (sbi->s_nzones - (sbi->s_firstdatazone + 1)), + s->s_blocksize); + if (sbi->s_zmap_blocks < block) { + printk("MINIX-fs: file system does not have enough " + "zmap blocks allocated. Refusing to mount.\n"); + goto out_iput; + } + return 0; out_iput: @@ -339,10 +360,10 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; - buf->f_bfree = minix_count_free_blocks(sbi); + buf->f_bfree = minix_count_free_blocks(sb); buf->f_bavail = buf->f_bfree; buf->f_files = sbi->s_ninodes; - buf->f_ffree = minix_count_free_inodes(sbi); + buf->f_ffree = minix_count_free_inodes(sb); buf->f_namelen = sbi->s_namelen; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); @@ -446,7 +467,7 @@ static struct inode *V1_minix_iget(struct inode *inode) inode->i_mode = raw_inode->i_mode; inode->i_uid = (uid_t)raw_inode->i_uid; inode->i_gid = (gid_t)raw_inode->i_gid; - inode->i_nlink = raw_inode->i_nlinks; + set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; inode->i_mtime.tv_nsec = 0; @@ -479,7 +500,7 @@ static struct inode *V2_minix_iget(struct inode *inode) inode->i_mode = raw_inode->i_mode; inode->i_uid = (uid_t)raw_inode->i_uid; inode->i_gid = (gid_t)raw_inode->i_gid; - inode->i_nlink = raw_inode->i_nlinks; + set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; inode->i_mtime.tv_sec = raw_inode->i_mtime; inode->i_atime.tv_sec = raw_inode->i_atime; diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 341e2122879a..26bbd55e82ea 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -48,10 +48,10 @@ extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, stru extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); extern struct inode * minix_new_inode(const struct inode *, int, int *); extern void minix_free_inode(struct inode * inode); -extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); +extern unsigned long minix_count_free_inodes(struct super_block *sb); extern int minix_new_block(struct inode * inode); extern void minix_free_block(struct inode *inode, unsigned long block); -extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); +extern unsigned long minix_count_free_blocks(struct super_block *sb); extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); @@ -88,6 +88,11 @@ static inline struct minix_inode_info *minix_i(struct inode *inode) return list_entry(inode, struct minix_inode_info, vfs_inode); } +static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) +{ + return DIV_ROUND_UP(bits, blocksize * 8); +} + #if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) @@ -125,7 +130,7 @@ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size) if (!size) return 0; - size = (size >> 4) + ((size & 15) > 0); + size >>= 4; while (*p++ == 0xffff) { if (--size == 0) return (p - addr) << 4; diff --git a/fs/namei.c b/fs/namei.c index b7fad009bbf6..5008f01787f5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -32,6 +32,7 @@ #include <linux/fcntl.h> #include <linux/device_cgroup.h> #include <linux/fs_struct.h> +#include <linux/posix_acl.h> #include <asm/uaccess.h> #include "internal.h" @@ -136,7 +137,7 @@ static int do_getname(const char __user *filename, char *page) return retval; } -static char *getname_flags(const char __user * filename, int flags) +static char *getname_flags(const char __user *filename, int flags, int *empty) { char *tmp, *result; @@ -147,6 +148,8 @@ static char *getname_flags(const char __user * filename, int flags) result = tmp; if (retval < 0) { + if (retval == -ENOENT && empty) + *empty = 1; if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { __putname(tmp); result = ERR_PTR(retval); @@ -159,7 +162,7 @@ static char *getname_flags(const char __user * filename, int flags) char *getname(const char __user * filename) { - return getname_flags(filename, 0); + return getname_flags(filename, 0, 0); } #ifdef CONFIG_AUDITSYSCALL @@ -173,24 +176,66 @@ void putname(const char *name) EXPORT_SYMBOL(putname); #endif +static int check_acl(struct inode *inode, int mask) +{ +#ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *acl; + + if (mask & MAY_NOT_BLOCK) { + acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); + if (!acl) + return -EAGAIN; + /* no ->get_acl() calls in RCU mode... */ + if (acl == ACL_NOT_CACHED) + return -ECHILD; + return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); + } + + acl = get_cached_acl(inode, ACL_TYPE_ACCESS); + + /* + * A filesystem can force a ACL callback by just never filling the + * ACL cache. But normally you'd fill the cache either at inode + * instantiation time, or on the first ->get_acl call. + * + * If the filesystem doesn't have a get_acl() function at all, we'll + * just create the negative cache entry. + */ + if (acl == ACL_NOT_CACHED) { + if (inode->i_op->get_acl) { + acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } else { + set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); + return -EAGAIN; + } + } + + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } +#endif + + return -EAGAIN; +} + /* - * This does basic POSIX ACL permission checking + * This does the basic permission checking */ static int acl_permission_check(struct inode *inode, int mask) { - int (*check_acl)(struct inode *inode, int mask); unsigned int mode = inode->i_mode; - mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; - if (current_user_ns() != inode_userns(inode)) goto other_perms; - if (current_fsuid() == inode->i_uid) + if (likely(current_fsuid() == inode->i_uid)) mode >>= 6; else { - check_acl = inode->i_op->check_acl; - if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { + if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { int error = check_acl(inode, mask); if (error != -EAGAIN) return error; @@ -212,7 +257,7 @@ other_perms: /** * generic_permission - check for access rights on a Posix-like filesystem * @inode: inode to check access rights for - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions @@ -228,7 +273,7 @@ int generic_permission(struct inode *inode, int mask) int ret; /* - * Do the basic POSIX ACL permission checks. + * Do the basic permission checks. */ ret = acl_permission_check(inode, mask); if (ret != -EACCES) @@ -263,21 +308,43 @@ int generic_permission(struct inode *inode, int mask) return -EACCES; } +/* + * We _really_ want to just do "generic_permission()" without + * even looking at the inode->i_op values. So we keep a cache + * flag in inode->i_opflags, that says "this has not special + * permission function, use the fast case". + */ +static inline int do_inode_permission(struct inode *inode, int mask) +{ + if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { + if (likely(inode->i_op->permission)) + return inode->i_op->permission(inode, mask); + + /* This gets set once for the inode lifetime */ + spin_lock(&inode->i_lock); + inode->i_opflags |= IOP_FASTPERM; + spin_unlock(&inode->i_lock); + } + return generic_permission(inode, mask); +} + /** * inode_permission - check for access rights to a given inode * @inode: inode to check permission on - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) * * Used to check for read/write/execute permissions on an inode. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things. + * + * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ int inode_permission(struct inode *inode, int mask) { int retval; - if (mask & MAY_WRITE) { + if (unlikely(mask & MAY_WRITE)) { umode_t mode = inode->i_mode; /* @@ -294,11 +361,7 @@ int inode_permission(struct inode *inode, int mask) return -EACCES; } - if (inode->i_op->permission) - retval = inode->i_op->permission(inode, mask); - else - retval = generic_permission(inode, mask); - + retval = do_inode_permission(inode, mask); if (retval) return retval; @@ -660,23 +723,20 @@ static int follow_automount(struct path *path, unsigned flags, if (!path->dentry->d_op || !path->dentry->d_op->d_automount) return -EREMOTE; - /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT - * and this is the terminal part of the path. - */ - if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT)) - return -EISDIR; /* we actually want to stop here */ - - /* We want to mount if someone is trying to open/create a file of any - * type under the mountpoint, wants to traverse through the mountpoint - * or wants to open the mounted directory. + /* We don't want to mount if someone's just doing a stat - + * unless they're stat'ing a directory and appended a '/' to + * the name. * - * We don't want to mount if someone's just doing a stat and they've - * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and - * appended a '/' to the name. + * We do, however, want to mount if someone wants to open or + * create a file of any type under the mountpoint, wants to + * traverse through the mountpoint or wants to open the + * mounted directory. Also, autofs may mark negative dentries + * as being automount points. These will need the attentions + * of the daemon to instantiate them before they can be used. */ - if (!(flags & LOOKUP_FOLLOW) && - !(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE))) + if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + path->dentry->d_inode) return -EISDIR; current->total_link_count++; @@ -792,7 +852,7 @@ static int follow_managed(struct path *path, unsigned flags) mntput(path->mnt); if (ret == -EISDIR) ret = 0; - return ret; + return ret < 0 ? ret : need_mntput; } int follow_down_one(struct path *path) @@ -840,6 +900,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, break; path->mnt = mounted; path->dentry = mounted->mnt_root; + nd->flags |= LOOKUP_JUMPED; nd->seq = read_seqcount_begin(&path->dentry->d_seq); /* * Update the inode too. We don't need to re-check the @@ -1153,6 +1214,8 @@ retry: path_put_conditional(path, nd); return err; } + if (err) + nd->flags |= LOOKUP_JUMPED; *inode = path->dentry->d_inode; return 0; } @@ -1194,6 +1257,26 @@ static void terminate_walk(struct nameidata *nd) } } +/* + * Do we need to follow links? We _really_ want to be able + * to do this check without having to look at inode->i_op, + * so we keep a cache of "no, this doesn't need follow_link" + * for the common case. + */ +static inline int should_follow_link(struct inode *inode, int follow) +{ + if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { + if (likely(inode->i_op->follow_link)) + return follow; + + /* This gets set once for the inode lifetime */ + spin_lock(&inode->i_lock); + inode->i_opflags |= IOP_NOFOLLOW; + spin_unlock(&inode->i_lock); + } + return 0; +} + static inline int walk_component(struct nameidata *nd, struct path *path, struct qstr *name, int type, int follow) { @@ -1216,7 +1299,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path, terminate_walk(nd); return -ENOENT; } - if (unlikely(inode->i_op->follow_link) && follow) { + if (should_follow_link(inode, follow)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { terminate_walk(nd); @@ -1269,6 +1352,26 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) } /* + * We really don't want to look at inode->i_op->lookup + * when we don't have to. So we keep a cache bit in + * the inode ->i_opflags field that says "yes, we can + * do lookup on this inode". + */ +static inline int can_lookup(struct inode *inode) +{ + if (likely(inode->i_opflags & IOP_LOOKUP)) + return 1; + if (likely(!inode->i_op->lookup)) + return 0; + + /* We do this once for the lifetime of the inode */ + spin_lock(&inode->i_lock); + inode->i_opflags |= IOP_LOOKUP; + spin_unlock(&inode->i_lock); + return 1; +} + +/* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. @@ -1347,10 +1450,10 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (err) return err; } + if (can_lookup(nd->inode)) + continue; err = -ENOTDIR; - if (!nd->inode->i_op->lookup) - break; - continue; + break; /* here ends the main loop */ last_component: @@ -1700,11 +1803,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return __lookup_hash(&this, base, NULL); } -int user_path_at(int dfd, const char __user *name, unsigned flags, - struct path *path) +int user_path_at_empty(int dfd, const char __user *name, unsigned flags, + struct path *path, int *empty) { struct nameidata nd; - char *tmp = getname_flags(name, flags); + char *tmp = getname_flags(name, flags, empty); int err = PTR_ERR(tmp); if (!IS_ERR(tmp)) { @@ -1718,6 +1821,12 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, return err; } +int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) +{ + return user_path_at_empty(dfd, name, flags, path, 0); +} + static int user_path_parent(int dfd, const char __user *path, struct nameidata *nd, char **name) { @@ -1937,10 +2046,7 @@ static int may_open(struct path *path, int acc_mode, int flag) if (flag & O_NOATIME && !inode_owner_or_capable(inode)) return -EPERM; - /* - * Ensure there are no outstanding leases on the file. - */ - return break_lease(inode, flag); + return 0; } static int handle_truncate(struct file *filp) @@ -2043,6 +2149,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } /* create side of things */ + /* + * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been + * cleared when we got to the last component we are about to look up + */ error = complete_walk(nd); if (error) return ERR_PTR(error); @@ -2111,6 +2221,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (error < 0) goto exit_dput; + if (error) + nd->flags |= LOOKUP_JUMPED; + error = -ENOENT; if (!path->dentry->d_inode) goto exit_dput; @@ -2120,6 +2233,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, path_to_nameidata(path, nd); nd->inode = path->dentry->d_inode; + /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ + error = complete_walk(nd); + if (error) + goto exit; error = -EISDIR; if (S_ISDIR(nd->inode->i_mode)) goto exit; @@ -2512,6 +2629,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (!dir->i_op->rmdir) return -EPERM; + dget(dentry); mutex_lock(&dentry->d_inode->i_mutex); error = -EBUSY; @@ -2532,6 +2650,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) out: mutex_unlock(&dentry->d_inode->i_mutex); + dput(dentry); if (!error) d_delete(dentry); return error; @@ -2921,6 +3040,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; + dget(new_dentry); if (target) mutex_lock(&target->i_mutex); @@ -2941,6 +3061,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, out: if (target) mutex_unlock(&target->i_mutex); + dput(new_dentry); if (!error) if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) d_move(old_dentry,new_dentry); diff --git a/fs/namespace.c b/fs/namespace.c index cda50fe9250a..6d3a1963879b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1109,6 +1109,7 @@ static int show_vfsstat(struct seq_file *m, void *v) /* device */ if (mnt->mnt_sb->s_op->show_devname) { + seq_puts(m, "device "); err = mnt->mnt_sb->s_op->show_devname(m, mnt); } else { if (mnt->mnt_devname) { @@ -1757,7 +1758,7 @@ static int do_loopback(struct path *path, char *old_name, return err; if (!old_name || !*old_name) return -EINVAL; - err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); + err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); if (err) return err; @@ -2482,11 +2483,43 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) __mnt_make_longterm(mnt); new_ns->root = mnt; list_add(&new_ns->list, &new_ns->root->mnt_list); + } else { + mntput(mnt); } return new_ns; } EXPORT_SYMBOL(create_mnt_ns); +struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) +{ + struct mnt_namespace *ns; + struct super_block *s; + struct path path; + int err; + + ns = create_mnt_ns(mnt); + if (IS_ERR(ns)) + return ERR_CAST(ns); + + err = vfs_path_lookup(mnt->mnt_root, mnt, + name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + + put_mnt_ns(ns); + + if (err) + return ERR_PTR(err); + + /* trade a vfsmount reference for active sb one */ + s = path.mnt->mnt_sb; + atomic_inc(&s->s_active); + mntput(path.mnt); + /* lock the sucker */ + down_write(&s->s_umount); + /* ... and return the root of (sub)tree on it */ + return path.dentry; +} +EXPORT_SYMBOL(mount_subtree); + SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { @@ -2721,6 +2754,25 @@ EXPORT_SYMBOL(put_mnt_ns); struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) { - return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); + struct vfsmount *mnt; + mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); + if (!IS_ERR(mnt)) { + /* + * it is a longterm mount, don't release mnt until + * we unmount before file sys is unregistered + */ + mnt_make_longterm(mnt); + } + return mnt; } EXPORT_SYMBOL_GPL(kern_mount_data); + +void kern_unmount(struct vfsmount *mnt) +{ + /* release long term mount so mount point can be released */ + if (!IS_ERR_OR_NULL(mnt)) { + mnt_make_shortterm(mnt); + mntput(mnt); + } +} +EXPORT_SYMBOL(kern_unmount); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 202f370526a7..5b5fa33b6b9d 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -228,7 +228,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_uid = server->m.uid; inode->i_gid = server->m.gid; diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 81515545ba75..dbcd82126aed 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -77,6 +77,7 @@ config NFS_V4 config NFS_V4_1 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" depends on NFS_FS && NFS_V4 && EXPERIMENTAL + select SUNRPC_BACKCHANNEL select PNFS_FILE_LAYOUT help This option enables support for minor version 1 of the NFSv4 protocol @@ -87,15 +88,15 @@ config NFS_V4_1 config PNFS_FILE_LAYOUT tristate +config PNFS_BLOCK + tristate + depends on NFS_FS && NFS_V4_1 && BLK_DEV_DM + default m + config PNFS_OBJLAYOUT - tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" + tristate depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD - help - Say M here if you want your pNFS client to support the Objects Layout Driver. - Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and - upper level driver (SCSI_OSD_ULD). - - If unsure, say N. + default m config ROOT_NFS bool "Root file system on NFS" diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 6a34f7dd0e6f..b58613d0abb3 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile new file mode 100644 index 000000000000..d5815505c020 --- /dev/null +++ b/fs/nfs/blocklayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS block layout driver kernel module +# +obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o +blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c new file mode 100644 index 000000000000..281ae95932c9 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayout.c @@ -0,0 +1,1024 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.c + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/bio.h> /* struct bio */ +#include <linux/buffer_head.h> /* various write calls */ +#include <linux/prefetch.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); +MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); + +struct dentry *bl_device_pipe; +wait_queue_head_t bl_wq; + +static void print_page(struct page *page) +{ + dprintk("PRINTPAGE page %p\n", page); + dprintk(" PagePrivate %d\n", PagePrivate(page)); + dprintk(" PageUptodate %d\n", PageUptodate(page)); + dprintk(" PageError %d\n", PageError(page)); + dprintk(" PageDirty %d\n", PageDirty(page)); + dprintk(" PageReferenced %d\n", PageReferenced(page)); + dprintk(" PageLocked %d\n", PageLocked(page)); + dprintk(" PageWriteback %d\n", PageWriteback(page)); + dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); + dprintk("\n"); +} + +/* Given the be associated with isect, determine if page data needs to be + * initialized. + */ +static int is_hole(struct pnfs_block_extent *be, sector_t isect) +{ + if (be->be_state == PNFS_BLOCK_NONE_DATA) + return 1; + else if (be->be_state != PNFS_BLOCK_INVALID_DATA) + return 0; + else + return !bl_is_sector_init(be->be_inval, isect); +} + +/* Given the be associated with isect, determine if page data can be + * written to disk. + */ +static int is_writable(struct pnfs_block_extent *be, sector_t isect) +{ + return (be->be_state == PNFS_BLOCK_READWRITE_DATA || + be->be_state == PNFS_BLOCK_INVALID_DATA); +} + +/* The data we are handed might be spread across several bios. We need + * to track when the last one is finished. + */ +struct parallel_io { + struct kref refcnt; + struct rpc_call_ops call_ops; + void (*pnfs_callback) (void *data); + void *data; +}; + +static inline struct parallel_io *alloc_parallel(void *data) +{ + struct parallel_io *rv; + + rv = kmalloc(sizeof(*rv), GFP_NOFS); + if (rv) { + rv->data = data; + kref_init(&rv->refcnt); + } + return rv; +} + +static inline void get_parallel(struct parallel_io *p) +{ + kref_get(&p->refcnt); +} + +static void destroy_parallel(struct kref *kref) +{ + struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); + + dprintk("%s enter\n", __func__); + p->pnfs_callback(p->data); + kfree(p); +} + +static inline void put_parallel(struct parallel_io *p) +{ + kref_put(&p->refcnt, destroy_parallel); +} + +static struct bio * +bl_submit_bio(int rw, struct bio *bio) +{ + if (bio) { + get_parallel(bio->bi_private); + dprintk("%s submitting %s bio %u@%llu\n", __func__, + rw == READ ? "read" : "write", + bio->bi_size, (unsigned long long)bio->bi_sector); + submit_bio(rw, bio); + } + return NULL; +} + +static struct bio *bl_alloc_init_bio(int npg, sector_t isect, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par) +{ + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, npg); + if (!bio) + return NULL; + + bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_bdev = be->be_mdev; + bio->bi_end_io = end_io; + bio->bi_private = par; + return bio; +} + +static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, + sector_t isect, struct page *page, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par) +{ +retry: + if (!bio) { + bio = bl_alloc_init_bio(npg, isect, be, end_io, par); + if (!bio) + return ERR_PTR(-ENOMEM); + } + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + bio = bl_submit_bio(rw, bio); + goto retry; + } + return bio; +} + +/* This is basically copied from mpage_end_io_read */ +static void bl_end_io_read(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct nfs_read_data *rdata = (struct nfs_read_data *)par->data; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + if (uptodate) + SetPageUptodate(page); + } while (bvec >= bio->bi_io_vec); + if (!uptodate) { + if (!rdata->pnfs_error) + rdata->pnfs_error = -EIO; + pnfs_set_lo_fail(rdata->lseg); + } + bio_put(bio); + put_parallel(par); +} + +static void bl_read_cleanup(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_read_data *rdata; + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + rdata = container_of(task, struct nfs_read_data, task); + pnfs_ld_read_done(rdata); +} + +static void +bl_end_par_io_read(void *data) +{ + struct nfs_read_data *rdata = data; + + INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); + schedule_work(&rdata->task.u.tk_work); +} + +/* We don't want normal .rpc_call_done callback used, so we replace it + * with this stub. + */ +static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) +{ + return; +} + +static enum pnfs_try_status +bl_read_pagelist(struct nfs_read_data *rdata) +{ + int i, hole; + struct bio *bio = NULL; + struct pnfs_block_extent *be = NULL, *cow_read = NULL; + sector_t isect, extent_length = 0; + struct parallel_io *par; + loff_t f_offset = rdata->args.offset; + size_t count = rdata->args.count; + struct page **pages = rdata->args.pages; + int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; + + dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, + rdata->npages, f_offset, count); + + par = alloc_parallel(rdata); + if (!par) + goto use_mds; + par->call_ops = *rdata->mds_ops; + par->call_ops.rpc_call_done = bl_rpc_do_nothing; + par->pnfs_callback = bl_end_par_io_read; + /* At this point, we can no longer jump to use_mds */ + + isect = (sector_t) (f_offset >> SECTOR_SHIFT); + /* Code assumes extents are page-aligned */ + for (i = pg_index; i < rdata->npages; i++) { + if (!extent_length) { + /* We've used up the previous extent */ + bl_put_extent(be); + bl_put_extent(cow_read); + bio = bl_submit_bio(READ, bio); + /* Get the next one */ + be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg), + isect, &cow_read); + if (!be) { + rdata->pnfs_error = -EIO; + goto out; + } + extent_length = be->be_length - + (isect - be->be_f_offset); + if (cow_read) { + sector_t cow_length = cow_read->be_length - + (isect - cow_read->be_f_offset); + extent_length = min(extent_length, cow_length); + } + } + hole = is_hole(be, isect); + if (hole && !cow_read) { + bio = bl_submit_bio(READ, bio); + /* Fill hole w/ zeroes w/o accessing device */ + dprintk("%s Zeroing page for hole\n", __func__); + zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); + print_page(pages[i]); + SetPageUptodate(pages[i]); + } else { + struct pnfs_block_extent *be_read; + + be_read = (hole && cow_read) ? cow_read : be; + bio = bl_add_page_to_bio(bio, rdata->npages - i, READ, + isect, pages[i], be_read, + bl_end_io_read, par); + if (IS_ERR(bio)) { + rdata->pnfs_error = PTR_ERR(bio); + bio = NULL; + goto out; + } + } + isect += PAGE_CACHE_SECTORS; + extent_length -= PAGE_CACHE_SECTORS; + } + if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) { + rdata->res.eof = 1; + rdata->res.count = rdata->inode->i_size - f_offset; + } else { + rdata->res.count = (isect << SECTOR_SHIFT) - f_offset; + } +out: + bl_put_extent(be); + bl_put_extent(cow_read); + bl_submit_bio(READ, bio); + put_parallel(par); + return PNFS_ATTEMPTED; + + use_mds: + dprintk("Giving up and using normal NFS\n"); + return PNFS_NOT_ATTEMPTED; +} + +static void mark_extents_written(struct pnfs_block_layout *bl, + __u64 offset, __u32 count) +{ + sector_t isect, end; + struct pnfs_block_extent *be; + + dprintk("%s(%llu, %u)\n", __func__, offset, count); + if (count == 0) + return; + isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT; + end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); + end >>= SECTOR_SHIFT; + while (isect < end) { + sector_t len; + be = bl_find_get_extent(bl, isect, NULL); + BUG_ON(!be); /* FIXME */ + len = min(end, be->be_f_offset + be->be_length) - isect; + if (be->be_state == PNFS_BLOCK_INVALID_DATA) + bl_mark_for_commit(be, isect, len); /* What if fails? */ + isect += len; + bl_put_extent(be); + } +} + +static void bl_end_io_write_zero(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + /* This is the zeroing page we added */ + end_page_writeback(page); + page_cache_release(page); + } while (bvec >= bio->bi_io_vec); + if (!uptodate) { + if (!wdata->pnfs_error) + wdata->pnfs_error = -EIO; + pnfs_set_lo_fail(wdata->lseg); + } + bio_put(bio); + put_parallel(par); +} + +/* This is basically copied from mpage_end_io_read */ +static void bl_end_io_write(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; + + if (!uptodate) { + if (!wdata->pnfs_error) + wdata->pnfs_error = -EIO; + pnfs_set_lo_fail(wdata->lseg); + } + bio_put(bio); + put_parallel(par); +} + +/* Function scheduled for call during bl_end_par_io_write, + * it marks sectors as written and extends the commitlist. + */ +static void bl_write_cleanup(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_write_data *wdata; + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + wdata = container_of(task, struct nfs_write_data, task); + if (!wdata->pnfs_error) { + /* Marks for LAYOUTCOMMIT */ + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), + wdata->args.offset, wdata->args.count); + } + pnfs_ld_write_done(wdata); +} + +/* Called when last of bios associated with a bl_write_pagelist call finishes */ +static void bl_end_par_io_write(void *data) +{ + struct nfs_write_data *wdata = data; + + wdata->task.tk_status = 0; + wdata->verf.committed = NFS_FILE_SYNC; + INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); + schedule_work(&wdata->task.u.tk_work); +} + +/* FIXME STUB - mark intersection of layout and page as bad, so is not + * used again. + */ +static void mark_bad_read(void) +{ + return; +} + +/* + * map_block: map a requested I/0 block (isect) into an offset in the LVM + * block_device + */ +static void +map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) +{ + dprintk("%s enter be=%p\n", __func__, be); + + set_buffer_mapped(bh); + bh->b_bdev = be->be_mdev; + bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> + (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT); + + dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n", + __func__, (unsigned long long)isect, (long)bh->b_blocknr, + bh->b_size); + return; +} + +/* Given an unmapped page, zero it or read in page for COW, page is locked + * by caller. + */ +static int +init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) +{ + struct buffer_head *bh = NULL; + int ret = 0; + sector_t isect; + + dprintk("%s enter, %p\n", __func__, page); + BUG_ON(PageUptodate(page)); + if (!cow_read) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + goto cleanup; + } + + bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); + if (!bh) { + ret = -ENOMEM; + goto cleanup; + } + + isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT; + map_block(bh, isect, cow_read); + if (!bh_uptodate_or_lock(bh)) + ret = bh_submit_read(bh); + if (ret) + goto cleanup; + SetPageUptodate(page); + +cleanup: + bl_put_extent(cow_read); + if (bh) + free_buffer_head(bh); + if (ret) { + /* Need to mark layout with bad read...should now + * just use nfs4 for reads and writes. + */ + mark_bad_read(); + } + return ret; +} + +static enum pnfs_try_status +bl_write_pagelist(struct nfs_write_data *wdata, int sync) +{ + int i, ret, npg_zero, pg_index, last = 0; + struct bio *bio = NULL; + struct pnfs_block_extent *be = NULL, *cow_read = NULL; + sector_t isect, last_isect = 0, extent_length = 0; + struct parallel_io *par; + loff_t offset = wdata->args.offset; + size_t count = wdata->args.count; + struct page **pages = wdata->args.pages; + struct page *page; + pgoff_t index; + u64 temp; + int npg_per_block = + NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; + + dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); + /* At this point, wdata->pages is a (sequential) list of nfs_pages. + * We want to write each, and if there is an error set pnfs_error + * to have it redone using nfs. + */ + par = alloc_parallel(wdata); + if (!par) + return PNFS_NOT_ATTEMPTED; + par->call_ops = *wdata->mds_ops; + par->call_ops.rpc_call_done = bl_rpc_do_nothing; + par->pnfs_callback = bl_end_par_io_write; + /* At this point, have to be more careful with error handling */ + + isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); + be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); + if (!be || !is_writable(be, isect)) { + dprintk("%s no matching extents!\n", __func__); + wdata->pnfs_error = -EINVAL; + goto out; + } + + /* First page inside INVALID extent */ + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + temp = offset >> PAGE_CACHE_SHIFT; + npg_zero = do_div(temp, npg_per_block); + isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & + (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); + extent_length = be->be_length - (isect - be->be_f_offset); + +fill_invalid_ext: + dprintk("%s need to zero %d pages\n", __func__, npg_zero); + for (;npg_zero > 0; npg_zero--) { + if (bl_is_sector_init(be->be_inval, isect)) { + dprintk("isect %llu already init\n", + (unsigned long long)isect); + goto next_page; + } + /* page ref released in bl_end_io_write_zero */ + index = isect >> PAGE_CACHE_SECTOR_SHIFT; + dprintk("%s zero %dth page: index %lu isect %llu\n", + __func__, npg_zero, index, + (unsigned long long)isect); + page = + find_or_create_page(wdata->inode->i_mapping, index, + GFP_NOFS); + if (!page) { + dprintk("%s oom\n", __func__); + wdata->pnfs_error = -ENOMEM; + goto out; + } + + /* PageDirty: Other will write this out + * PageWriteback: Other is writing this out + * PageUptodate: It was read before + * sector_initialized: already written out + */ + if (PageDirty(page) || PageWriteback(page)) { + print_page(page); + unlock_page(page); + page_cache_release(page); + goto next_page; + } + if (!PageUptodate(page)) { + /* New page, readin or zero it */ + init_page_for_write(page, cow_read); + } + set_page_writeback(page); + unlock_page(page); + + ret = bl_mark_sectors_init(be->be_inval, isect, + PAGE_CACHE_SECTORS, + NULL); + if (unlikely(ret)) { + dprintk("%s bl_mark_sectors_init fail %d\n", + __func__, ret); + end_page_writeback(page); + page_cache_release(page); + wdata->pnfs_error = ret; + goto out; + } + bio = bl_add_page_to_bio(bio, npg_zero, WRITE, + isect, page, be, + bl_end_io_write_zero, par); + if (IS_ERR(bio)) { + wdata->pnfs_error = PTR_ERR(bio); + bio = NULL; + goto out; + } + /* FIXME: This should be done in bi_end_io */ + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), + page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE); +next_page: + isect += PAGE_CACHE_SECTORS; + extent_length -= PAGE_CACHE_SECTORS; + } + if (last) + goto write_done; + } + bio = bl_submit_bio(WRITE, bio); + + /* Middle pages */ + pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; + for (i = pg_index; i < wdata->npages; i++) { + if (!extent_length) { + /* We've used up the previous extent */ + bl_put_extent(be); + bio = bl_submit_bio(WRITE, bio); + /* Get the next one */ + be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), + isect, NULL); + if (!be || !is_writable(be, isect)) { + wdata->pnfs_error = -EINVAL; + goto out; + } + extent_length = be->be_length - + (isect - be->be_f_offset); + } + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + ret = bl_mark_sectors_init(be->be_inval, isect, + PAGE_CACHE_SECTORS, + NULL); + if (unlikely(ret)) { + dprintk("%s bl_mark_sectors_init fail %d\n", + __func__, ret); + wdata->pnfs_error = ret; + goto out; + } + } + bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE, + isect, pages[i], be, + bl_end_io_write, par); + if (IS_ERR(bio)) { + wdata->pnfs_error = PTR_ERR(bio); + bio = NULL; + goto out; + } + isect += PAGE_CACHE_SECTORS; + last_isect = isect; + extent_length -= PAGE_CACHE_SECTORS; + } + + /* Last page inside INVALID extent */ + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + bio = bl_submit_bio(WRITE, bio); + temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; + npg_zero = npg_per_block - do_div(temp, npg_per_block); + if (npg_zero < npg_per_block) { + last = 1; + goto fill_invalid_ext; + } + } + +write_done: + wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset); + if (count < wdata->res.count) { + wdata->res.count = count; + } +out: + bl_put_extent(be); + bl_submit_bio(WRITE, bio); + put_parallel(par); + return PNFS_ATTEMPTED; +} + +/* FIXME - range ignored */ +static void +release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range) +{ + int i; + struct pnfs_block_extent *be; + + spin_lock(&bl->bl_ext_lock); + for (i = 0; i < EXTENT_LISTS; i++) { + while (!list_empty(&bl->bl_extents[i])) { + be = list_first_entry(&bl->bl_extents[i], + struct pnfs_block_extent, + be_node); + list_del(&be->be_node); + bl_put_extent(be); + } + } + spin_unlock(&bl->bl_ext_lock); +} + +static void +release_inval_marks(struct pnfs_inval_markings *marks) +{ + struct pnfs_inval_tracking *pos, *temp; + + list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { + list_del(&pos->it_link); + kfree(pos); + } + return; +} + +static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + + dprintk("%s enter\n", __func__); + release_extents(bl, NULL); + release_inval_marks(&bl->bl_inval); + kfree(bl); +} + +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, + gfp_t gfp_flags) +{ + struct pnfs_block_layout *bl; + + dprintk("%s enter\n", __func__); + bl = kzalloc(sizeof(*bl), gfp_flags); + if (!bl) + return NULL; + spin_lock_init(&bl->bl_ext_lock); + INIT_LIST_HEAD(&bl->bl_extents[0]); + INIT_LIST_HEAD(&bl->bl_extents[1]); + INIT_LIST_HEAD(&bl->bl_commit); + INIT_LIST_HEAD(&bl->bl_committing); + bl->bl_count = 0; + bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT; + BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); + return &bl->bl_layout; +} + +static void bl_free_lseg(struct pnfs_layout_segment *lseg) +{ + dprintk("%s enter\n", __func__); + kfree(lseg); +} + +/* We pretty much ignore lseg, and store all data layout wide, so we + * can correctly merge. + */ +static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + struct pnfs_layout_segment *lseg; + int status; + + dprintk("%s enter\n", __func__); + lseg = kzalloc(sizeof(*lseg), gfp_flags); + if (!lseg) + return ERR_PTR(-ENOMEM); + status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); + if (status) { + /* We don't want to call the full-blown bl_free_lseg, + * since on error extents were not touched. + */ + kfree(lseg); + return ERR_PTR(status); + } + return lseg; +} + +static void +bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg) +{ + dprintk("%s enter\n", __func__); + encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); +} + +static void +bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) +{ + struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; + + dprintk("%s enter\n", __func__); + clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); +} + +static void free_blk_mountid(struct block_mount_id *mid) +{ + if (mid) { + struct pnfs_block_dev *dev; + spin_lock(&mid->bm_lock); + while (!list_empty(&mid->bm_devlist)) { + dev = list_first_entry(&mid->bm_devlist, + struct pnfs_block_dev, + bm_node); + list_del(&dev->bm_node); + bl_free_block_dev(dev); + } + spin_unlock(&mid->bm_lock); + kfree(mid); + } +} + +/* This is mostly copied from the filelayout's get_device_info function. + * It seems much of this should be at the generic pnfs level. + */ +static struct pnfs_block_dev * +nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, + struct nfs4_deviceid *d_id) +{ + struct pnfs_device *dev; + struct pnfs_block_dev *rv; + u32 max_resp_sz; + int max_pages; + struct page **pages = NULL; + int i, rc; + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = max_resp_sz >> PAGE_SHIFT; + dprintk("%s max_resp_sz %u max_pages %d\n", + __func__, max_resp_sz, max_pages); + + dev = kmalloc(sizeof(*dev), GFP_NOFS); + if (!dev) { + dprintk("%s kmalloc failed\n", __func__); + return ERR_PTR(-ENOMEM); + } + + pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(GFP_NOFS); + if (!pages[i]) { + rv = ERR_PTR(-ENOMEM); + goto out_free; + } + } + + memcpy(&dev->dev_id, d_id, sizeof(*d_id)); + dev->layout_type = LAYOUT_BLOCK_VOLUME; + dev->pages = pages; + dev->pgbase = 0; + dev->pglen = PAGE_SIZE * max_pages; + dev->mincount = 0; + + dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); + rc = nfs4_proc_getdeviceinfo(server, dev); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) { + rv = ERR_PTR(rc); + goto out_free; + } + + rv = nfs4_blk_decode_device(server, dev); + out_free: + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + kfree(dev); + return rv; +} + +static int +bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) +{ + struct block_mount_id *b_mt_id = NULL; + struct pnfs_devicelist *dlist = NULL; + struct pnfs_block_dev *bdev; + LIST_HEAD(block_disklist); + int status, i; + + dprintk("%s enter\n", __func__); + + if (server->pnfs_blksize == 0) { + dprintk("%s Server did not return blksize\n", __func__); + return -EINVAL; + } + b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS); + if (!b_mt_id) { + status = -ENOMEM; + goto out_error; + } + /* Initialize nfs4 block layout mount id */ + spin_lock_init(&b_mt_id->bm_lock); + INIT_LIST_HEAD(&b_mt_id->bm_devlist); + + dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS); + if (!dlist) { + status = -ENOMEM; + goto out_error; + } + dlist->eof = 0; + while (!dlist->eof) { + status = nfs4_proc_getdevicelist(server, fh, dlist); + if (status) + goto out_error; + dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", + __func__, dlist->num_devs, dlist->eof); + for (i = 0; i < dlist->num_devs; i++) { + bdev = nfs4_blk_get_deviceinfo(server, fh, + &dlist->dev_id[i]); + if (IS_ERR(bdev)) { + status = PTR_ERR(bdev); + goto out_error; + } + spin_lock(&b_mt_id->bm_lock); + list_add(&bdev->bm_node, &b_mt_id->bm_devlist); + spin_unlock(&b_mt_id->bm_lock); + } + } + dprintk("%s SUCCESS\n", __func__); + server->pnfs_ld_data = b_mt_id; + + out_return: + kfree(dlist); + return status; + + out_error: + free_blk_mountid(b_mt_id); + goto out_return; +} + +static int +bl_clear_layoutdriver(struct nfs_server *server) +{ + struct block_mount_id *b_mt_id = server->pnfs_ld_data; + + dprintk("%s enter\n", __func__); + free_blk_mountid(b_mt_id); + dprintk("%s RETURNS\n", __func__); + return 0; +} + +static const struct nfs_pageio_ops bl_pg_read_ops = { + .pg_init = pnfs_generic_pg_init_read, + .pg_test = pnfs_generic_pg_test, + .pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops bl_pg_write_ops = { + .pg_init = pnfs_generic_pg_init_write, + .pg_test = pnfs_generic_pg_test, + .pg_doio = pnfs_generic_pg_writepages, +}; + +static struct pnfs_layoutdriver_type blocklayout_type = { + .id = LAYOUT_BLOCK_VOLUME, + .name = "LAYOUT_BLOCK_VOLUME", + .read_pagelist = bl_read_pagelist, + .write_pagelist = bl_write_pagelist, + .alloc_layout_hdr = bl_alloc_layout_hdr, + .free_layout_hdr = bl_free_layout_hdr, + .alloc_lseg = bl_alloc_lseg, + .free_lseg = bl_free_lseg, + .encode_layoutcommit = bl_encode_layoutcommit, + .cleanup_layoutcommit = bl_cleanup_layoutcommit, + .set_layoutdriver = bl_set_layoutdriver, + .clear_layoutdriver = bl_clear_layoutdriver, + .pg_read_ops = &bl_pg_read_ops, + .pg_write_ops = &bl_pg_write_ops, +}; + +static const struct rpc_pipe_ops bl_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = bl_pipe_downcall, + .destroy_msg = bl_pipe_destroy_msg, +}; + +static int __init nfs4blocklayout_init(void) +{ + struct vfsmount *mnt; + struct path path; + int ret; + + dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); + + ret = pnfs_register_layoutdriver(&blocklayout_type); + if (ret) + goto out; + + init_waitqueue_head(&bl_wq); + + mnt = rpc_get_mount(); + if (IS_ERR(mnt)) { + ret = PTR_ERR(mnt); + goto out_remove; + } + + ret = vfs_path_lookup(mnt->mnt_root, + mnt, + NFS_PIPE_DIRNAME, 0, &path); + if (ret) + goto out_putrpc; + + bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, + &bl_upcall_ops, 0); + path_put(&path); + if (IS_ERR(bl_device_pipe)) { + ret = PTR_ERR(bl_device_pipe); + goto out_putrpc; + } +out: + return ret; + +out_putrpc: + rpc_put_mount(); +out_remove: + pnfs_unregister_layoutdriver(&blocklayout_type); + return ret; +} + +static void __exit nfs4blocklayout_exit(void) +{ + dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", + __func__); + + pnfs_unregister_layoutdriver(&blocklayout_type); + rpc_unlink(bl_device_pipe); + rpc_put_mount(); +} + +MODULE_ALIAS("nfs-layouttype4-3"); + +module_init(nfs4blocklayout_init); +module_exit(nfs4blocklayout_exit); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h new file mode 100644 index 000000000000..42acf7ef5992 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayout.h @@ -0,0 +1,205 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ +#ifndef FS_NFS_NFS4BLOCKLAYOUT_H +#define FS_NFS_NFS4BLOCKLAYOUT_H + +#include <linux/device-mapper.h> +#include <linux/nfs_fs.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + +#include "../pnfs.h" + +#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) +#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) + +struct block_mount_id { + spinlock_t bm_lock; /* protects list */ + struct list_head bm_devlist; /* holds pnfs_block_dev */ +}; + +struct pnfs_block_dev { + struct list_head bm_node; + struct nfs4_deviceid bm_mdevid; /* associated devid */ + struct block_device *bm_mdev; /* meta device itself */ +}; + +enum exstate4 { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ + PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ +}; + +#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ + +struct my_tree { + sector_t mtt_step_size; /* Internal sector alignment */ + struct list_head mtt_stub; /* Should be a radix tree */ +}; + +struct pnfs_inval_markings { + spinlock_t im_lock; + struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ + sector_t im_block_size; /* Server blocksize in sectors */ +}; + +struct pnfs_inval_tracking { + struct list_head it_link; + int it_sector; + int it_tags; +}; + +/* sector_t fields are all in 512-byte sectors */ +struct pnfs_block_extent { + struct kref be_refcnt; + struct list_head be_node; /* link into lseg list */ + struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ + struct block_device *be_mdev; + sector_t be_f_offset; /* the starting offset in the file */ + sector_t be_length; /* the size of the extent */ + sector_t be_v_offset; /* the starting offset in the volume */ + enum exstate4 be_state; /* the state of this extent */ + struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ +}; + +/* Shortened extent used by LAYOUTCOMMIT */ +struct pnfs_block_short_extent { + struct list_head bse_node; + struct nfs4_deviceid bse_devid; + struct block_device *bse_mdev; + sector_t bse_f_offset; /* the starting offset in the file */ + sector_t bse_length; /* the size of the extent */ +}; + +static inline void +BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) +{ + spin_lock_init(&marks->im_lock); + INIT_LIST_HEAD(&marks->im_tree.mtt_stub); + marks->im_block_size = blocksize; + marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, + blocksize); +} + +enum extentclass4 { + RW_EXTENT = 0, /* READWRTE and INVAL */ + RO_EXTENT = 1, /* READ and NONE */ + EXTENT_LISTS = 2, +}; + +static inline int bl_choose_list(enum exstate4 state) +{ + if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) + return RO_EXTENT; + else + return RW_EXTENT; +} + +struct pnfs_block_layout { + struct pnfs_layout_hdr bl_layout; + struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ + spinlock_t bl_ext_lock; /* Protects list manipulation */ + struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ + struct list_head bl_commit; /* Needs layout commit */ + struct list_head bl_committing; /* Layout committing */ + unsigned int bl_count; /* entries in bl_commit */ + sector_t bl_blocksize; /* Server blocksize in sectors */ +}; + +#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data)) + +static inline struct pnfs_block_layout * +BLK_LO2EXT(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct pnfs_block_layout, bl_layout); +} + +static inline struct pnfs_block_layout * +BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) +{ + return BLK_LO2EXT(lseg->pls_layout); +} + +struct bl_dev_msg { + int32_t status; + uint32_t major, minor; +}; + +struct bl_msg_hdr { + u8 type; + u16 totallen; /* length of entire message, including hdr itself */ +}; + +extern struct dentry *bl_device_pipe; +extern wait_queue_head_t bl_wq; + +#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ +#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ +#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ +#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ +#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ + +/* blocklayoutdev.c */ +ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); +void bl_pipe_destroy_msg(struct rpc_pipe_msg *); +struct block_device *nfs4_blkdev_get(dev_t dev); +int nfs4_blkdev_put(struct block_device *bdev); +struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, + struct pnfs_device *dev); +int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, + struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); + +/* blocklayoutdm.c */ +void bl_free_block_dev(struct pnfs_block_dev *bdev); + +/* extents.c */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent **cow_read); +int bl_mark_sectors_init(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length, + sector_t **pages); +void bl_put_extent(struct pnfs_block_extent *be); +struct pnfs_block_extent *bl_alloc_extent(void); +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); +int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg); +void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + const struct nfs4_layoutcommit_args *arg, + int status); +int bl_add_merge_extent(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new); +int bl_mark_for_commit(struct pnfs_block_extent *be, + sector_t offset, sector_t length); + +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c new file mode 100644 index 000000000000..d08ba9107fde --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -0,0 +1,391 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdev.c + * + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ +#include <linux/module.h> +#include <linux/buffer_head.h> /* __bread */ + +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include <linux/hash.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static int decode_sector_number(__be32 **rp, sector_t *sp) +{ + uint64_t s; + + *rp = xdr_decode_hyper(*rp, &s); + if (s & 0x1ff) { + printk(KERN_WARNING "%s: sector not aligned\n", __func__); + return -1; + } + *sp = s >> SECTOR_SHIFT; + return 0; +} + +/* Open a block_device by device number. */ +struct block_device *nfs4_blkdev_get(dev_t dev) +{ + struct block_device *bd; + + dprintk("%s enter\n", __func__); + bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); + if (IS_ERR(bd)) + goto fail; + return bd; +fail: + dprintk("%s failed to open device : %ld\n", + __func__, PTR_ERR(bd)); + return NULL; +} + +/* + * Release the block device + */ +int nfs4_blkdev_put(struct block_device *bdev) +{ + dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), + MINOR(bdev->bd_dev)); + return blkdev_put(bdev, FMODE_READ); +} + +static struct bl_dev_msg bl_mount_reply; + +ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, + size_t mlen) +{ + if (mlen != sizeof (struct bl_dev_msg)) + return -EINVAL; + + if (copy_from_user(&bl_mount_reply, src, mlen) != 0) + return -EFAULT; + + wake_up(&bl_wq); + + return mlen; +} + +void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + if (msg->errno >= 0) + return; + wake_up(&bl_wq); +} + +/* + * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. + */ +struct pnfs_block_dev * +nfs4_blk_decode_device(struct nfs_server *server, + struct pnfs_device *dev) +{ + struct pnfs_block_dev *rv; + struct block_device *bd = NULL; + struct rpc_pipe_msg msg; + struct bl_msg_hdr bl_msg = { + .type = BL_DEVICE_MOUNT, + .totallen = dev->mincount, + }; + uint8_t *dataptr; + DECLARE_WAITQUEUE(wq, current); + struct bl_dev_msg *reply = &bl_mount_reply; + int offset, len, i, rc; + + dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); + dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, + dev->mincount); + + memset(&msg, 0, sizeof(msg)); + msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); + if (!msg.data) { + rv = ERR_PTR(-ENOMEM); + goto out; + } + + memcpy(msg.data, &bl_msg, sizeof(bl_msg)); + dataptr = (uint8_t *) msg.data; + len = dev->mincount; + offset = sizeof(bl_msg); + for (i = 0; len > 0; i++) { + memcpy(&dataptr[offset], page_address(dev->pages[i]), + len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE); + len -= PAGE_CACHE_SIZE; + offset += PAGE_CACHE_SIZE; + } + msg.len = sizeof(bl_msg) + dev->mincount; + + dprintk("%s CALLING USERSPACE DAEMON\n", __func__); + add_wait_queue(&bl_wq, &wq); + rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg); + if (rc < 0) { + remove_wait_queue(&bl_wq, &wq); + rv = ERR_PTR(rc); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&bl_wq, &wq); + + if (reply->status != BL_DEVICE_REQUEST_PROC) { + dprintk("%s failed to open device: %d\n", + __func__, reply->status); + rv = ERR_PTR(-EINVAL); + goto out; + } + + bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); + if (IS_ERR(bd)) { + rc = PTR_ERR(bd); + dprintk("%s failed to open device : %d\n", __func__, rc); + rv = ERR_PTR(rc); + goto out; + } + + rv = kzalloc(sizeof(*rv), GFP_NOFS); + if (!rv) { + rv = ERR_PTR(-ENOMEM); + goto out; + } + + rv->bm_mdev = bd; + memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); + dprintk("%s Created device %s with bd_block_size %u\n", + __func__, + bd->bd_disk->disk_name, + bd->bd_block_size); + +out: + kfree(msg.data); + return rv; +} + +/* Map deviceid returned by the server to constructed block_device */ +static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, + struct nfs4_deviceid *id) +{ + struct block_device *rv = NULL; + struct block_mount_id *mid; + struct pnfs_block_dev *dev; + + dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); + mid = BLK_ID(lo); + spin_lock(&mid->bm_lock); + list_for_each_entry(dev, &mid->bm_devlist, bm_node) { + if (memcmp(id->data, dev->bm_mdevid.data, + NFS4_DEVICEID4_SIZE) == 0) { + rv = dev->bm_mdev; + goto out; + } + } + out: + spin_unlock(&mid->bm_lock); + dprintk("%s returning %p\n", __func__, rv); + return rv; +} + +/* Tracks info needed to ensure extents in layout obey constraints of spec */ +struct layout_verification { + u32 mode; /* R or RW */ + u64 start; /* Expected start of next non-COW extent */ + u64 inval; /* Start of INVAL coverage */ + u64 cowread; /* End of COW read coverage */ +}; + +/* Verify the extent meets the layout requirements of the pnfs-block draft, + * section 2.3.1. + */ +static int verify_extent(struct pnfs_block_extent *be, + struct layout_verification *lv) +{ + if (lv->mode == IOMODE_READ) { + if (be->be_state == PNFS_BLOCK_READWRITE_DATA || + be->be_state == PNFS_BLOCK_INVALID_DATA) + return -EIO; + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; + } + /* lv->mode == IOMODE_RW */ + if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + if (lv->cowread > lv->start) + return -EIO; + lv->start += be->be_length; + lv->inval = lv->start; + return 0; + } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; + } else if (be->be_state == PNFS_BLOCK_READ_DATA) { + if (be->be_f_offset > lv->start) + return -EIO; + if (be->be_f_offset < lv->inval) + return -EIO; + if (be->be_f_offset < lv->cowread) + return -EIO; + /* It looks like you might want to min this with lv->start, + * but you really don't. + */ + lv->inval = lv->inval + be->be_length; + lv->cowread = be->be_f_offset + be->be_length; + return 0; + } else + return -EIO; +} + +/* XDR decode pnfs_block_layout4 structure */ +int +nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, + struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + int i, status = -EIO; + uint32_t count; + struct pnfs_block_extent *be = NULL, *save; + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + __be32 *p; + struct layout_verification lv = { + .mode = lgr->range.iomode, + .start = lgr->range.offset >> SECTOR_SHIFT, + .inval = lgr->range.offset >> SECTOR_SHIFT, + .cowread = lgr->range.offset >> SECTOR_SHIFT, + }; + LIST_HEAD(extents); + + dprintk("---> %s\n", __func__); + + scratch = alloc_page(gfp_flags); + if (!scratch) + return -ENOMEM; + + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err; + + count = be32_to_cpup(p++); + + dprintk("%s enter, number of extents %i\n", __func__, count); + p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); + if (unlikely(!p)) + goto out_err; + + /* Decode individual extents, putting them in temporary + * staging area until whole layout is decoded to make error + * recovery easier. + */ + for (i = 0; i < count; i++) { + be = bl_alloc_extent(); + if (!be) { + status = -ENOMEM; + goto out_err; + } + memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + be->be_mdev = translate_devid(lo, &be->be_devid); + if (!be->be_mdev) + goto out_err; + + /* The next three values are read in as bytes, + * but stored as 512-byte sector lengths + */ + if (decode_sector_number(&p, &be->be_f_offset) < 0) + goto out_err; + if (decode_sector_number(&p, &be->be_length) < 0) + goto out_err; + if (decode_sector_number(&p, &be->be_v_offset) < 0) + goto out_err; + be->be_state = be32_to_cpup(p++); + if (be->be_state == PNFS_BLOCK_INVALID_DATA) + be->be_inval = &bl->bl_inval; + if (verify_extent(be, &lv)) { + dprintk("%s verify failed\n", __func__); + goto out_err; + } + list_add_tail(&be->be_node, &extents); + } + if (lgr->range.offset + lgr->range.length != + lv.start << SECTOR_SHIFT) { + dprintk("%s Final length mismatch\n", __func__); + be = NULL; + goto out_err; + } + if (lv.start < lv.cowread) { + dprintk("%s Final uncovered COW extent\n", __func__); + be = NULL; + goto out_err; + } + /* Extents decoded properly, now try to merge them in to + * existing layout extents. + */ + spin_lock(&bl->bl_ext_lock); + list_for_each_entry_safe(be, save, &extents, be_node) { + list_del(&be->be_node); + status = bl_add_merge_extent(bl, be); + if (status) { + spin_unlock(&bl->bl_ext_lock); + /* This is a fairly catastrophic error, as the + * entire layout extent lists are now corrupted. + * We should have some way to distinguish this. + */ + be = NULL; + goto out_err; + } + } + spin_unlock(&bl->bl_ext_lock); + status = 0; + out: + __free_page(scratch); + dprintk("%s returns %i\n", __func__, status); + return status; + + out_err: + bl_put_extent(be); + while (!list_empty(&extents)) { + be = list_first_entry(&extents, struct pnfs_block_extent, + be_node); + list_del(&be->be_node); + bl_put_extent(be); + } + goto out; +} diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c new file mode 100644 index 000000000000..d055c7558073 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -0,0 +1,111 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdm.c + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2007 The Regents of the University of Michigan. + * All rights reserved. + * + * Fred Isaman <iisaman@umich.edu> + * Andy Adamson <andros@citi.umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include <linux/genhd.h> /* gendisk - used in a dprintk*/ +#include <linux/sched.h> +#include <linux/hash.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static void dev_remove(dev_t dev) +{ + struct rpc_pipe_msg msg; + struct bl_dev_msg bl_umount_request; + struct bl_msg_hdr bl_msg = { + .type = BL_DEVICE_UMOUNT, + .totallen = sizeof(bl_umount_request), + }; + uint8_t *dataptr; + DECLARE_WAITQUEUE(wq, current); + + dprintk("Entering %s\n", __func__); + + memset(&msg, 0, sizeof(msg)); + msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); + if (!msg.data) + goto out; + + memset(&bl_umount_request, 0, sizeof(bl_umount_request)); + bl_umount_request.major = MAJOR(dev); + bl_umount_request.minor = MINOR(dev); + + memcpy(msg.data, &bl_msg, sizeof(bl_msg)); + dataptr = (uint8_t *) msg.data; + memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); + msg.len = sizeof(bl_msg) + bl_msg.totallen; + + add_wait_queue(&bl_wq, &wq); + if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { + remove_wait_queue(&bl_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&bl_wq, &wq); + +out: + kfree(msg.data); +} + +/* + * Release meta device + */ +static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) +{ + int rv; + + dprintk("%s Releasing\n", __func__); + rv = nfs4_blkdev_put(bdev->bm_mdev); + if (rv) + printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n", + __func__, rv); + + dev_remove(bdev->bm_mdev->bd_dev); +} + +void bl_free_block_dev(struct pnfs_block_dev *bdev) +{ + if (bdev) { + if (bdev->bm_mdev) { + dprintk("%s Removing DM device: %d:%d\n", + __func__, + MAJOR(bdev->bm_mdev->bd_dev), + MINOR(bdev->bm_mdev->bd_dev)); + nfs4_blk_metadev_release(bdev); + } + kfree(bdev); + } +} diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c new file mode 100644 index 000000000000..19fa7b0b8c00 --- /dev/null +++ b/fs/nfs/blocklayout/extents.c @@ -0,0 +1,935 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include "blocklayout.h" +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* Bit numbers */ +#define EXTENT_INITIALIZED 0 +#define EXTENT_WRITTEN 1 +#define EXTENT_IN_COMMIT 2 +#define INTERNAL_EXISTS MY_MAX_TAGS +#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) + +/* Returns largest t<=s s.t. t%base==0 */ +static inline sector_t normalize(sector_t s, int base) +{ + sector_t tmp = s; /* Since do_div modifies its argument */ + return s - do_div(tmp, base); +} + +static inline sector_t normalize_up(sector_t s, int base) +{ + return normalize(s + base - 1, base); +} + +/* Complete stub using list while determine API wanted */ + +/* Returns tags, or negative */ +static int32_t _find_entry(struct my_tree *tree, u64 s) +{ + struct pnfs_inval_tracking *pos; + + dprintk("%s(%llu) enter\n", __func__, s); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector > s) + continue; + else if (pos->it_sector == s) + return pos->it_tags & INTERNAL_MASK; + else + break; + } + return -ENOENT; +} + +static inline +int _has_tag(struct my_tree *tree, u64 s, int32_t tag) +{ + int32_t tags; + + dprintk("%s(%llu, %i) enter\n", __func__, s, tag); + s = normalize(s, tree->mtt_step_size); + tags = _find_entry(tree, s); + if ((tags < 0) || !(tags & (1 << tag))) + return 0; + else + return 1; +} + +/* Creates entry with tag, or if entry already exists, unions tag to it. + * If storage is not NULL, newly created entry will use it. + * Returns number of entries added, or negative on error. + */ +static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, + struct pnfs_inval_tracking *storage) +{ + int found = 0; + struct pnfs_inval_tracking *pos; + + dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector > s) + continue; + else if (pos->it_sector == s) { + found = 1; + break; + } else + break; + } + if (found) { + pos->it_tags |= (1 << tag); + return 0; + } else { + struct pnfs_inval_tracking *new; + if (storage) + new = storage; + else { + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return -ENOMEM; + } + new->it_sector = s; + new->it_tags = (1 << tag); + list_add(&new->it_link, &pos->it_link); + return 1; + } +} + +/* XXXX Really want option to not create */ +/* Over range, unions tag with existing entries, else creates entry with tag */ +static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) +{ + u64 i; + + dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); + for (i = normalize(s, tree->mtt_step_size); i < s + length; + i += tree->mtt_step_size) + if (_add_entry(tree, i, tag, NULL)) + return -ENOMEM; + return 0; +} + +/* Ensure that future operations on given range of tree will not malloc */ +static int _preload_range(struct my_tree *tree, u64 offset, u64 length) +{ + u64 start, end, s; + int count, i, used = 0, status = -ENOMEM; + struct pnfs_inval_tracking **storage; + + dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); + start = normalize(offset, tree->mtt_step_size); + end = normalize_up(offset + length, tree->mtt_step_size); + count = (int)(end - start) / (int)tree->mtt_step_size; + + /* Pre-malloc what memory we might need */ + storage = kmalloc(sizeof(*storage) * count, GFP_NOFS); + if (!storage) + return -ENOMEM; + for (i = 0; i < count; i++) { + storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), + GFP_NOFS); + if (!storage[i]) + goto out_cleanup; + } + + /* Now need lock - HOW??? */ + + for (s = start; s < end; s += tree->mtt_step_size) + used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); + + /* Unlock - HOW??? */ + status = 0; + + out_cleanup: + for (i = used; i < count; i++) { + if (!storage[i]) + break; + kfree(storage[i]); + } + kfree(storage); + return status; +} + +static void set_needs_init(sector_t *array, sector_t offset) +{ + sector_t *p = array; + + dprintk("%s enter\n", __func__); + if (!p) + return; + while (*p < offset) + p++; + if (*p == offset) + return; + else if (*p == ~0) { + *p++ = offset; + *p = ~0; + return; + } else { + sector_t *save = p; + dprintk("%s Adding %llu\n", __func__, (u64)offset); + while (*p != ~0) + p++; + p++; + memmove(save + 1, save, (char *)p - (char *)save); + *save = offset; + return; + } +} + +/* We are relying on page lock to serialize this */ +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) +{ + int rv; + + spin_lock(&marks->im_lock); + rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); + spin_unlock(&marks->im_lock); + return rv; +} + +/* Assume start, end already sector aligned */ +static int +_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag) +{ + struct pnfs_inval_tracking *pos; + u64 expect = 0; + + dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector >= end) + continue; + if (!expect) { + if ((pos->it_sector == end - tree->mtt_step_size) && + (pos->it_tags & (1 << tag))) { + expect = pos->it_sector - tree->mtt_step_size; + if (pos->it_sector < tree->mtt_step_size || expect < start) + return 1; + continue; + } else { + return 0; + } + } + if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) + return 0; + expect -= tree->mtt_step_size; + if (expect < start) + return 1; + } + return 0; +} + +static int is_range_written(struct pnfs_inval_markings *marks, + sector_t start, sector_t end) +{ + int rv; + + spin_lock(&marks->im_lock); + rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); + spin_unlock(&marks->im_lock); + return rv; +} + +/* Marks sectors in [offest, offset_length) as having been initialized. + * All lengths are step-aligned, where step is min(pagesize, blocksize). + * Notes where partial block is initialized, and helps prepare it for + * complete initialization later. + */ +/* Currently assumes offset is page-aligned */ +int bl_mark_sectors_init(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length, + sector_t **pages) +{ + sector_t s, start, end; + sector_t *array = NULL; /* Pages to mark */ + + dprintk("%s(offset=%llu,len=%llu) enter\n", + __func__, (u64)offset, (u64)length); + s = max((sector_t) 3, + 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); + dprintk("%s set max=%llu\n", __func__, (u64)s); + if (pages) { + array = kmalloc(s * sizeof(sector_t), GFP_NOFS); + if (!array) + goto outerr; + array[0] = ~0; + } + + start = normalize(offset, marks->im_block_size); + end = normalize_up(offset + length, marks->im_block_size); + if (_preload_range(&marks->im_tree, start, end - start)) + goto outerr; + + spin_lock(&marks->im_lock); + + for (s = normalize_up(start, PAGE_CACHE_SECTORS); + s < offset; s += PAGE_CACHE_SECTORS) { + dprintk("%s pre-area pages\n", __func__); + /* Portion of used block is not initialized */ + if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) + set_needs_init(array, s); + } + if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) + goto out_unlock; + for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); + s < end; s += PAGE_CACHE_SECTORS) { + dprintk("%s post-area pages\n", __func__); + if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) + set_needs_init(array, s); + } + + spin_unlock(&marks->im_lock); + + if (pages) { + if (array[0] == ~0) { + kfree(array); + *pages = NULL; + } else + *pages = array; + } + return 0; + + out_unlock: + spin_unlock(&marks->im_lock); + outerr: + if (pages) { + kfree(array); + *pages = NULL; + } + return -ENOMEM; +} + +/* Marks sectors in [offest, offset+length) as having been written to disk. + * All lengths should be block aligned. + */ +static int mark_written_sectors(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length) +{ + int status; + + dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, + (u64)offset, (u64)length); + spin_lock(&marks->im_lock); + status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); + spin_unlock(&marks->im_lock); + return status; +} + +static void print_short_extent(struct pnfs_block_short_extent *be) +{ + dprintk("PRINT SHORT EXTENT extent %p\n", be); + if (be) { + dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); + dprintk(" be_length %llu\n", (u64)be->bse_length); + } +} + +static void print_clist(struct list_head *list, unsigned int count) +{ + struct pnfs_block_short_extent *be; + unsigned int i = 0; + + ifdebug(FACILITY) { + printk(KERN_DEBUG "****************\n"); + printk(KERN_DEBUG "Extent list looks like:\n"); + list_for_each_entry(be, list, bse_node) { + i++; + print_short_extent(be); + } + if (i != count) + printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count); + printk(KERN_DEBUG "****************\n"); + } +} + +/* Note: In theory, we should do more checking that devid's match between + * old and new, but if they don't, the lists are too corrupt to salvage anyway. + */ +/* Note this is very similar to bl_add_merge_extent */ +static void add_to_commitlist(struct pnfs_block_layout *bl, + struct pnfs_block_short_extent *new) +{ + struct list_head *clist = &bl->bl_commit; + struct pnfs_block_short_extent *old, *save; + sector_t end = new->bse_f_offset + new->bse_length; + + dprintk("%s enter\n", __func__); + print_short_extent(new); + print_clist(clist, bl->bl_count); + bl->bl_count++; + /* Scan for proper place to insert, extending new to the left + * as much as possible. + */ + list_for_each_entry_safe(old, save, clist, bse_node) { + if (new->bse_f_offset < old->bse_f_offset) + break; + if (end <= old->bse_f_offset + old->bse_length) { + /* Range is already in list */ + bl->bl_count--; + kfree(new); + return; + } else if (new->bse_f_offset <= + old->bse_f_offset + old->bse_length) { + /* new overlaps or abuts existing be */ + if (new->bse_mdev == old->bse_mdev) { + /* extend new to fully replace old */ + new->bse_length += new->bse_f_offset - + old->bse_f_offset; + new->bse_f_offset = old->bse_f_offset; + list_del(&old->bse_node); + bl->bl_count--; + kfree(old); + } + } + } + /* Note that if we never hit the above break, old will not point to a + * valid extent. However, in that case &old->bse_node==list. + */ + list_add_tail(&new->bse_node, &old->bse_node); + /* Scan forward for overlaps. If we find any, extend new and + * remove the overlapped extent. + */ + old = list_prepare_entry(new, clist, bse_node); + list_for_each_entry_safe_continue(old, save, clist, bse_node) { + if (end < old->bse_f_offset) + break; + /* new overlaps or abuts old */ + if (new->bse_mdev == old->bse_mdev) { + if (end < old->bse_f_offset + old->bse_length) { + /* extend new to fully cover old */ + end = old->bse_f_offset + old->bse_length; + new->bse_length = end - new->bse_f_offset; + } + list_del(&old->bse_node); + bl->bl_count--; + kfree(old); + } + } + dprintk("%s: after merging\n", __func__); + print_clist(clist, bl->bl_count); +} + +/* Note the range described by offset, length is guaranteed to be contained + * within be. + */ +int bl_mark_for_commit(struct pnfs_block_extent *be, + sector_t offset, sector_t length) +{ + sector_t new_end, end = offset + length; + struct pnfs_block_short_extent *new; + struct pnfs_block_layout *bl = container_of(be->be_inval, + struct pnfs_block_layout, + bl_inval); + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return -ENOMEM; + + mark_written_sectors(be->be_inval, offset, length); + /* We want to add the range to commit list, but it must be + * block-normalized, and verified that the normalized range has + * been entirely written to disk. + */ + new->bse_f_offset = offset; + offset = normalize(offset, bl->bl_blocksize); + if (offset < new->bse_f_offset) { + if (is_range_written(be->be_inval, offset, new->bse_f_offset)) + new->bse_f_offset = offset; + else + new->bse_f_offset = offset + bl->bl_blocksize; + } + new_end = normalize_up(end, bl->bl_blocksize); + if (end < new_end) { + if (is_range_written(be->be_inval, end, new_end)) + end = new_end; + else + end = new_end - bl->bl_blocksize; + } + if (end <= new->bse_f_offset) { + kfree(new); + return 0; + } + new->bse_length = end - new->bse_f_offset; + new->bse_devid = be->be_devid; + new->bse_mdev = be->be_mdev; + + spin_lock(&bl->bl_ext_lock); + /* new will be freed, either by add_to_commitlist if it decides not + * to use it, or after LAYOUTCOMMIT uses it in the commitlist. + */ + add_to_commitlist(bl, new); + spin_unlock(&bl->bl_ext_lock); + return 0; +} + +static void print_bl_extent(struct pnfs_block_extent *be) +{ + dprintk("PRINT EXTENT extent %p\n", be); + if (be) { + dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); + dprintk(" be_length %llu\n", (u64)be->be_length); + dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); + dprintk(" be_state %d\n", be->be_state); + } +} + +static void +destroy_extent(struct kref *kref) +{ + struct pnfs_block_extent *be; + + be = container_of(kref, struct pnfs_block_extent, be_refcnt); + dprintk("%s be=%p\n", __func__, be); + kfree(be); +} + +void +bl_put_extent(struct pnfs_block_extent *be) +{ + if (be) { + dprintk("%s enter %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_put(&be->be_refcnt, destroy_extent); + } +} + +struct pnfs_block_extent *bl_alloc_extent(void) +{ + struct pnfs_block_extent *be; + + be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS); + if (!be) + return NULL; + INIT_LIST_HEAD(&be->be_node); + kref_init(&be->be_refcnt); + be->be_inval = NULL; + return be; +} + +static void print_elist(struct list_head *list) +{ + struct pnfs_block_extent *be; + dprintk("****************\n"); + dprintk("Extent list looks like:\n"); + list_for_each_entry(be, list, be_node) { + print_bl_extent(be); + } + dprintk("****************\n"); +} + +static inline int +extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) +{ + /* Note this assumes new->be_f_offset >= old->be_f_offset */ + return (new->be_state == old->be_state) && + ((new->be_state == PNFS_BLOCK_NONE_DATA) || + ((new->be_v_offset - old->be_v_offset == + new->be_f_offset - old->be_f_offset) && + new->be_mdev == old->be_mdev)); +} + +/* Adds new to appropriate list in bl, modifying new and removing existing + * extents as appropriate to deal with overlaps. + * + * See bl_find_get_extent for list constraints. + * + * Refcount on new is already set. If end up not using it, or error out, + * need to put the reference. + * + * bl->bl_ext_lock is held by caller. + */ +int +bl_add_merge_extent(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new) +{ + struct pnfs_block_extent *be, *tmp; + sector_t end = new->be_f_offset + new->be_length; + struct list_head *list; + + dprintk("%s enter with be=%p\n", __func__, new); + print_bl_extent(new); + list = &bl->bl_extents[bl_choose_list(new->be_state)]; + print_elist(list); + + /* Scan for proper place to insert, extending new to the left + * as much as possible. + */ + list_for_each_entry_safe_reverse(be, tmp, list, be_node) { + if (new->be_f_offset >= be->be_f_offset + be->be_length) + break; + if (new->be_f_offset >= be->be_f_offset) { + if (end <= be->be_f_offset + be->be_length) { + /* new is a subset of existing be*/ + if (extents_consistent(be, new)) { + dprintk("%s: new is subset, ignoring\n", + __func__); + bl_put_extent(new); + return 0; + } else { + goto out_err; + } + } else { + /* |<-- be -->| + * |<-- new -->| */ + if (extents_consistent(be, new)) { + /* extend new to fully replace be */ + new->be_length += new->be_f_offset - + be->be_f_offset; + new->be_f_offset = be->be_f_offset; + new->be_v_offset = be->be_v_offset; + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { + goto out_err; + } + } + } else if (end >= be->be_f_offset + be->be_length) { + /* new extent overlap existing be */ + if (extents_consistent(be, new)) { + /* extend new to fully replace be */ + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { + goto out_err; + } + } else if (end > be->be_f_offset) { + /* |<-- be -->| + *|<-- new -->| */ + if (extents_consistent(new, be)) { + /* extend new to fully replace be */ + new->be_length += be->be_f_offset + be->be_length - + new->be_f_offset - new->be_length; + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { + goto out_err; + } + } + } + /* Note that if we never hit the above break, be will not point to a + * valid extent. However, in that case &be->be_node==list. + */ + list_add(&new->be_node, &be->be_node); + dprintk("%s: inserting new\n", __func__); + print_elist(list); + /* FIXME - The per-list consistency checks have all been done, + * should now check cross-list consistency. + */ + return 0; + + out_err: + bl_put_extent(new); + return -EIO; +} + +/* Returns extent, or NULL. If a second READ extent exists, it is returned + * in cow_read, if given. + * + * The extents are kept in two seperate ordered lists, one for READ and NONE, + * one for READWRITE and INVALID. Within each list, we assume: + * 1. Extents are ordered by file offset. + * 2. For any given isect, there is at most one extents that matches. + */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent **cow_read) +{ + struct pnfs_block_extent *be, *cow, *ret; + int i; + + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); + cow = ret = NULL; + spin_lock(&bl->bl_ext_lock); + for (i = 0; i < EXTENT_LISTS; i++) { + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { + if (isect >= be->be_f_offset + be->be_length) + break; + if (isect >= be->be_f_offset) { + /* We have found an extent */ + dprintk("%s Get %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_get(&be->be_refcnt); + if (!ret) + ret = be; + else if (be->be_state != PNFS_BLOCK_READ_DATA) + bl_put_extent(be); + else + cow = be; + break; + } + } + if (ret && + (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) + break; + } + spin_unlock(&bl->bl_ext_lock); + if (cow_read) + *cow_read = cow; + print_bl_extent(ret); + return ret; +} + +/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */ +static struct pnfs_block_extent * +bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) +{ + struct pnfs_block_extent *be, *ret = NULL; + int i; + + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); + for (i = 0; i < EXTENT_LISTS; i++) { + if (ret) + break; + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { + if (isect >= be->be_f_offset + be->be_length) + break; + if (isect >= be->be_f_offset) { + /* We have found an extent */ + dprintk("%s Get %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_get(&be->be_refcnt); + ret = be; + break; + } + } + } + print_bl_extent(ret); + return ret; +} + +int +encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg) +{ + struct pnfs_block_short_extent *lce, *save; + unsigned int count = 0; + __be32 *p, *xdr_start; + + dprintk("%s enter\n", __func__); + /* BUG - creation of bl_commit is buggy - need to wait for + * entire block to be marked WRITTEN before it can be added. + */ + spin_lock(&bl->bl_ext_lock); + /* Want to adjust for possible truncate */ + /* We now want to adjust argument range */ + + /* XDR encode the ranges found */ + xdr_start = xdr_reserve_space(xdr, 8); + if (!xdr_start) + goto out; + list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { + p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); + if (!p) + break; + p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE); + p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT); + p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); + p = xdr_encode_hyper(p, 0LL); + *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); + list_del(&lce->bse_node); + list_add_tail(&lce->bse_node, &bl->bl_committing); + bl->bl_count--; + count++; + } + xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); + xdr_start[1] = cpu_to_be32(count); +out: + spin_unlock(&bl->bl_ext_lock); + dprintk("%s found %i ranges\n", __func__, count); + return 0; +} + +/* Helper function to set_to_rw that initialize a new extent */ +static void +_prep_new_extent(struct pnfs_block_extent *new, + struct pnfs_block_extent *orig, + sector_t offset, sector_t length, int state) +{ + kref_init(&new->be_refcnt); + /* don't need to INIT_LIST_HEAD(&new->be_node) */ + memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); + new->be_mdev = orig->be_mdev; + new->be_f_offset = offset; + new->be_length = length; + new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; + new->be_state = state; + new->be_inval = orig->be_inval; +} + +/* Tries to merge be with extent in front of it in list. + * Frees storage if not used. + */ +static struct pnfs_block_extent * +_front_merge(struct pnfs_block_extent *be, struct list_head *head, + struct pnfs_block_extent *storage) +{ + struct pnfs_block_extent *prev; + + if (!storage) + goto no_merge; + if (&be->be_node == head || be->be_node.prev == head) + goto no_merge; + prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); + if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || + !extents_consistent(prev, be)) + goto no_merge; + _prep_new_extent(storage, prev, prev->be_f_offset, + prev->be_length + be->be_length, prev->be_state); + list_replace(&prev->be_node, &storage->be_node); + bl_put_extent(prev); + list_del(&be->be_node); + bl_put_extent(be); + return storage; + + no_merge: + kfree(storage); + return be; +} + +static u64 +set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) +{ + u64 rv = offset + length; + struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; + struct pnfs_block_extent *children[3]; + struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; + int i = 0, j; + + dprintk("%s(%llu, %llu)\n", __func__, offset, length); + /* Create storage for up to three new extents e1, e2, e3 */ + e1 = kmalloc(sizeof(*e1), GFP_ATOMIC); + e2 = kmalloc(sizeof(*e2), GFP_ATOMIC); + e3 = kmalloc(sizeof(*e3), GFP_ATOMIC); + /* BUG - we are ignoring any failure */ + if (!e1 || !e2 || !e3) + goto out_nosplit; + + spin_lock(&bl->bl_ext_lock); + be = bl_find_get_extent_locked(bl, offset); + rv = be->be_f_offset + be->be_length; + if (be->be_state != PNFS_BLOCK_INVALID_DATA) { + spin_unlock(&bl->bl_ext_lock); + goto out_nosplit; + } + /* Add e* to children, bumping e*'s krefs */ + if (be->be_f_offset != offset) { + _prep_new_extent(e1, be, be->be_f_offset, + offset - be->be_f_offset, + PNFS_BLOCK_INVALID_DATA); + children[i++] = e1; + print_bl_extent(e1); + } else + merge1 = e1; + _prep_new_extent(e2, be, offset, + min(length, be->be_f_offset + be->be_length - offset), + PNFS_BLOCK_READWRITE_DATA); + children[i++] = e2; + print_bl_extent(e2); + if (offset + length < be->be_f_offset + be->be_length) { + _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, + be->be_f_offset + be->be_length - + offset - length, + PNFS_BLOCK_INVALID_DATA); + children[i++] = e3; + print_bl_extent(e3); + } else + merge2 = e3; + + /* Remove be from list, and insert the e* */ + /* We don't get refs on e*, since this list is the base reference + * set when init'ed. + */ + if (i < 3) + children[i] = NULL; + new = children[0]; + list_replace(&be->be_node, &new->be_node); + bl_put_extent(be); + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); + for (j = 1; j < i; j++) { + old = new; + new = children[j]; + list_add(&new->be_node, &old->be_node); + } + if (merge2) { + /* This is a HACK, should just create a _back_merge function */ + new = list_entry(new->be_node.next, + struct pnfs_block_extent, be_node); + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); + } + spin_unlock(&bl->bl_ext_lock); + + /* Since we removed the base reference above, be is now scheduled for + * destruction. + */ + bl_put_extent(be); + dprintk("%s returns %llu after split\n", __func__, rv); + return rv; + + out_nosplit: + kfree(e1); + kfree(e2); + kfree(e3); + dprintk("%s returns %llu without splitting\n", __func__, rv); + return rv; +} + +void +clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + const struct nfs4_layoutcommit_args *arg, + int status) +{ + struct pnfs_block_short_extent *lce, *save; + + dprintk("%s status %d\n", __func__, status); + list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) { + if (likely(!status)) { + u64 offset = lce->bse_f_offset; + u64 end = offset + lce->bse_length; + + do { + offset = set_to_rw(bl, offset, end - offset); + } while (offset < end); + list_del(&lce->bse_node); + + kfree(lce); + } else { + list_del(&lce->bse_node); + spin_lock(&bl->bl_ext_lock); + add_to_commitlist(bl, lce); + spin_unlock(&bl->bl_ext_lock); + } + } +} diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 76f856e284e4..7cf6cafcc007 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -6,7 +6,7 @@ #include <linux/completion.h> #include <linux/sunrpc/cache.h> -#include <asm/atomic.h> +#include <linux/atomic.h> /* * Deferred request handling diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index e3d294269058..516f3375e067 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -125,7 +125,7 @@ nfs4_callback_up(struct svc_serv *serv) else goto out_err; - return svc_prepare_thread(serv, &serv->sv_pools[0]); + return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); out_err: if (ret == 0) @@ -199,7 +199,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) INIT_LIST_HEAD(&serv->sv_cb_list); spin_lock_init(&serv->sv_cb_lock); init_waitqueue_head(&serv->sv_cb_waitq); - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); + rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); if (IS_ERR(rqstp)) { svc_xprt_put(serv->sv_bc_xprt); serv->sv_bc_xprt = NULL; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index b257383bb565..07df5f1d85e5 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -38,6 +38,7 @@ enum nfs4_callback_opnum { struct cb_process_state { __be32 drc_status; struct nfs_client *clp; + int slotid; }; struct cb_compound_hdr_arg { @@ -166,7 +167,6 @@ extern unsigned nfs4_callback_layoutrecall( void *dummy, struct cb_process_state *cps); extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); -extern void nfs4_cb_take_slot(struct nfs_client *clp); struct cb_devicenotifyitem { uint32_t cbd_notify_type; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index d4d1954e9bb9..43926add945b 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -111,6 +111,7 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf static u32 initiate_file_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { + struct nfs_server *server; struct pnfs_layout_hdr *lo; struct inode *ino; bool found = false; @@ -118,21 +119,28 @@ static u32 initiate_file_draining(struct nfs_client *clp, LIST_HEAD(free_me_list); spin_lock(&clp->cl_lock); - list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { - if (nfs_compare_fh(&args->cbl_fh, - &NFS_I(lo->plh_inode)->fh)) - continue; - ino = igrab(lo->plh_inode); - if (!ino) - continue; - found = true; - /* Without this, layout can be freed as soon - * as we release cl_lock. - */ - get_layout_hdr(lo); - break; + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (nfs_compare_fh(&args->cbl_fh, + &NFS_I(lo->plh_inode)->fh)) + continue; + ino = igrab(lo->plh_inode); + if (!ino) + continue; + found = true; + /* Without this, layout can be freed as soon + * as we release cl_lock. + */ + get_layout_hdr(lo); + break; + } + if (found) + break; } + rcu_read_unlock(); spin_unlock(&clp->cl_lock); + if (!found) return NFS4ERR_NOMATCHING_LAYOUT; @@ -154,6 +162,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, static u32 initiate_bulk_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { + struct nfs_server *server; struct pnfs_layout_hdr *lo; struct inode *ino; u32 rv = NFS4ERR_NOMATCHING_LAYOUT; @@ -167,18 +176,24 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, }; spin_lock(&clp->cl_lock); - list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { if ((args->cbl_recall_type == RETURN_FSID) && - memcmp(&NFS_SERVER(lo->plh_inode)->fsid, - &args->cbl_fsid, sizeof(struct nfs_fsid))) - continue; - if (!igrab(lo->plh_inode)) + memcmp(&server->fsid, &args->cbl_fsid, + sizeof(struct nfs_fsid))) continue; - get_layout_hdr(lo); - BUG_ON(!list_empty(&lo->plh_bulk_recall)); - list_add(&lo->plh_bulk_recall, &recall_list); + + list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (!igrab(lo->plh_inode)) + continue; + get_layout_hdr(lo); + BUG_ON(!list_empty(&lo->plh_bulk_recall)); + list_add(&lo->plh_bulk_recall, &recall_list); + } } + rcu_read_unlock(); spin_unlock(&clp->cl_lock); + list_for_each_entry_safe(lo, tmp, &recall_list, plh_bulk_recall) { ino = lo->plh_inode; @@ -333,7 +348,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) /* Normal */ if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { slot->seq_nr++; - return htonl(NFS4_OK); + goto out_ok; } /* Replay */ @@ -352,11 +367,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) /* Wraparound */ if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { slot->seq_nr = 1; - return htonl(NFS4_OK); + goto out_ok; } /* Misordered request */ return htonl(NFS4ERR_SEQ_MISORDERED); +out_ok: + tbl->highest_used_slotid = args->csa_slotid; + return htonl(NFS4_OK); } /* @@ -418,26 +436,37 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_sequenceres *res, struct cb_process_state *cps) { + struct nfs4_slot_table *tbl; struct nfs_client *clp; int i; __be32 status = htonl(NFS4ERR_BADSESSION); - cps->clp = NULL; - clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); if (clp == NULL) goto out; + tbl = &clp->cl_session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); /* state manager is resetting the session */ if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { - status = NFS4ERR_DELAY; + spin_unlock(&tbl->slot_tbl_lock); + status = htonl(NFS4ERR_DELAY); + /* Return NFS4ERR_BADSESSION if we're draining the session + * in order to reset it. + */ + if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + status = htonl(NFS4ERR_BADSESSION); goto out; } status = validate_seqid(&clp->cl_session->bc_slot_table, args); + spin_unlock(&tbl->slot_tbl_lock); if (status) goto out; + cps->slotid = args->csa_slotid; + /* * Check for pending referring calls. If a match is found, a * related callback was received before the response to the original @@ -454,7 +483,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, res->csr_slotid = args->csa_slotid; res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; - nfs4_cb_take_slot(clp); out: cps->clp = clp; /* put in nfs4_callback_compound */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index c6c86a77e043..726e59a9e50f 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallanyargs *args) { - __be32 *p; + uint32_t bitmap[2]; + __be32 *p, status; args->craa_addr = svc_addr(rqstp); p = read_buf(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->craa_objs_to_keep = ntohl(*p++); - p = read_buf(xdr, 4); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_BADXDR); - args->craa_type_mask = ntohl(*p); + status = decode_bitmap(xdr, bitmap); + if (unlikely(status)) + return status; + args->craa_type_mask = bitmap[0]; return 0; } @@ -754,26 +755,15 @@ static void nfs4_callback_free_slot(struct nfs4_session *session) * Let the state manager know callback processing done. * A single slot, so highest used slotid is either 0 or -1 */ - tbl->highest_used_slotid--; + tbl->highest_used_slotid = -1; nfs4_check_drain_bc_complete(session); spin_unlock(&tbl->slot_tbl_lock); } -static void nfs4_cb_free_slot(struct nfs_client *clp) +static void nfs4_cb_free_slot(struct cb_process_state *cps) { - if (clp && clp->cl_session) - nfs4_callback_free_slot(clp->cl_session); -} - -/* A single slot, so highest used slotid is either 0 or -1 */ -void nfs4_cb_take_slot(struct nfs_client *clp) -{ - struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table; - - spin_lock(&tbl->slot_tbl_lock); - tbl->highest_used_slotid++; - BUG_ON(tbl->highest_used_slotid != 0); - spin_unlock(&tbl->slot_tbl_lock); + if (cps->slotid != -1) + nfs4_callback_free_slot(cps->clp->cl_session); } #else /* CONFIG_NFS_V4_1 */ @@ -784,7 +774,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) return htonl(NFS4ERR_MINOR_VERS_MISMATCH); } -static void nfs4_cb_free_slot(struct nfs_client *clp) +static void nfs4_cb_free_slot(struct cb_process_state *cps) { } #endif /* CONFIG_NFS_V4_1 */ @@ -866,6 +856,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r struct cb_process_state cps = { .drc_status = 0, .clp = NULL, + .slotid = -1, }; unsigned int nops = 0; @@ -906,7 +897,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r *hdr_res.status = status; *hdr_res.nops = htonl(nops); - nfs4_cb_free_slot(cps.clp); + nfs4_cb_free_slot(&cps); nfs_put_client(cps.clp); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; @@ -996,4 +987,5 @@ struct svc_version nfs4_callback_version4 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = NULL, + .vs_hidden = 1, }; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index b3dc2b88b65b..873bf00d51a2 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -105,7 +105,7 @@ struct rpc_program nfs_program = { .nrvers = ARRAY_SIZE(nfs_version), .version = nfs_version, .stats = &nfs_rpcstat, - .pipe_dir_name = "/nfs", + .pipe_dir_name = NFS_PIPE_DIRNAME, }; struct rpc_stat nfs_rpcstat = { @@ -188,9 +188,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ cred = rpc_lookup_machine_cred(); if (!IS_ERR(cred)) clp->cl_machine_cred = cred; -#if defined(CONFIG_NFS_V4_1) - INIT_LIST_HEAD(&clp->cl_layouts); -#endif nfs_fscache_get_client_cookie(clp); return clp; @@ -293,6 +290,7 @@ static void nfs_free_client(struct nfs_client *clp) nfs4_deviceid_purge_client(clp); kfree(clp->cl_hostname); + kfree(clp->server_scope); kfree(clp); dprintk("<-- nfs_free_client()\n"); @@ -338,11 +336,12 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && - sin1->sin6_scope_id != sin2->sin6_scope_id) + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) return 0; + else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) + return sin1->sin6_scope_id == sin2->sin6_scope_id; - return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); + return 1; } #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, @@ -906,7 +905,9 @@ error: /* * Load up the server record from information gained in an fsinfo record */ -static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) +static void nfs_server_set_fsinfo(struct nfs_server *server, + struct nfs_fh *mntfh, + struct nfs_fsinfo *fsinfo) { unsigned long max_rpc_payload; @@ -936,7 +937,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - set_pnfs_layoutdriver(server, fsinfo->layouttype); + server->pnfs_blksize = fsinfo->blksize; + set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); @@ -982,7 +984,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str if (error < 0) goto out_error; - nfs_server_set_fsinfo(server, &fsinfo); + nfs_server_set_fsinfo(server, mntfh, &fsinfo); /* Get some general file system info */ if (server->namelen == 0) { @@ -1062,6 +1064,7 @@ static struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); INIT_LIST_HEAD(&server->delegations); + INIT_LIST_HEAD(&server->layouts); atomic_set(&server->active, 0); @@ -1464,7 +1467,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, dprintk("<-- %s %p\n", __func__, clp); return clp; } -EXPORT_SYMBOL(nfs4_set_ds_client); +EXPORT_SYMBOL_GPL(nfs4_set_ds_client); /* * Session has been established, and the client marked ready. @@ -1865,6 +1868,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v) /* display one transport per line on subsequent lines */ clp = list_entry(v, struct nfs_client, cl_share_link); + /* Check if the client is initialized */ + if (clp->cl_cons_state != NFS_CS_READY) + return 0; + seq_printf(m, "v%u %s %s %3d %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index dd25c2aec375..7f2654069806 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -240,7 +240,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct sizeof(delegation->stateid.data)); delegation->type = res->delegation_type; delegation->maxsize = res->maxsize; - delegation->change_attr = nfsi->change_attr; + delegation->change_attr = inode->i_version; delegation->cred = get_rpccred(cred); delegation->inode = inode; delegation->flags = 1<<NFS_DELEGATION_REFERENCED; @@ -398,12 +398,11 @@ int nfs_inode_return_delegation(struct inode *inode) return err; } -static void nfs_mark_return_delegation(struct nfs_delegation *delegation) +static void nfs_mark_return_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) { - struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client; - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); } /** @@ -441,7 +440,7 @@ static void nfs_mark_return_all_delegation_types(struct nfs_server *server, if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) continue; if (delegation->type & flags) - nfs_mark_return_delegation(delegation); + nfs_mark_return_delegation(server, delegation); } } @@ -508,7 +507,7 @@ static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) continue; - nfs_mark_return_delegation(delegation); + nfs_mark_return_delegation(server, delegation); } } @@ -539,7 +538,8 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp) int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; struct nfs_delegation *delegation; rcu_read_lock(); @@ -549,7 +549,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, rcu_read_unlock(); return -ENOENT; } - nfs_mark_return_delegation(delegation); + nfs_mark_return_delegation(server, delegation); rcu_read_unlock(); nfs_delegation_run_state_manager(clp); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 57f578e2560a..ac2899098147 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -134,18 +134,19 @@ const struct inode_operations nfs4_dir_inode_operations = { #endif /* CONFIG_NFS_V4 */ -static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred) +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) { struct nfs_open_dir_context *ctx; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { ctx->duped = 0; + ctx->attr_gencount = NFS_I(dir)->attr_gencount; ctx->dir_cookie = 0; ctx->dup_cookie = 0; ctx->cred = get_rpccred(cred); - } else - ctx = ERR_PTR(-ENOMEM); - return ctx; + return ctx; + } + return ERR_PTR(-ENOMEM); } static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) @@ -173,7 +174,7 @@ nfs_opendir(struct inode *inode, struct file *filp) cred = rpc_lookup_cred(); if (IS_ERR(cred)) return PTR_ERR(cred); - ctx = alloc_nfs_open_dir_context(cred); + ctx = alloc_nfs_open_dir_context(inode, cred); if (IS_ERR(ctx)) { res = PTR_ERR(ctx); goto out; @@ -323,7 +324,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri { loff_t diff = desc->file->f_pos - desc->current_index; unsigned int index; - struct nfs_open_dir_context *ctx = desc->file->private_data; if (diff < 0) goto out_eof; @@ -336,7 +336,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri index = (unsigned int)diff; *desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; - ctx->duped = 0; return 0; out_eof: desc->eof = 1; @@ -349,14 +348,34 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des int i; loff_t new_pos; int status = -EAGAIN; - struct nfs_open_dir_context *ctx = desc->file->private_data; for (i = 0; i < array->size; i++) { if (array->array[i].cookie == *desc->dir_cookie) { + struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode); + struct nfs_open_dir_context *ctx = desc->file->private_data; + new_pos = desc->current_index + i; - if (new_pos < desc->file->f_pos) { + if (ctx->attr_gencount != nfsi->attr_gencount + || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { + ctx->duped = 0; + ctx->attr_gencount = nfsi->attr_gencount; + } else if (new_pos < desc->file->f_pos) { + if (ctx->duped > 0 + && ctx->dup_cookie == *desc->dir_cookie) { + if (printk_ratelimit()) { + pr_notice("NFS: directory %s/%s contains a readdir loop." + "Please contact your server vendor. " + "The file: %s has duplicate cookie %llu\n", + desc->file->f_dentry->d_parent->d_name.name, + desc->file->f_dentry->d_name.name, + array->array[i].string.name, + *desc->dir_cookie); + } + status = -ELOOP; + goto out; + } ctx->dup_cookie = *desc->dir_cookie; - ctx->duped = 1; + ctx->duped = -1; } desc->file->f_pos = new_pos; desc->cache_entry_index = i; @@ -368,6 +387,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des if (*desc->dir_cookie == array->last_cookie) desc->eof = 1; } +out: return status; } @@ -740,19 +760,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, struct nfs_cache_array *array = NULL; struct nfs_open_dir_context *ctx = file->private_data; - if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) { - if (printk_ratelimit()) { - pr_notice("NFS: directory %s/%s contains a readdir loop. " - "Please contact your server vendor. " - "Offending cookie: %llu\n", - file->f_dentry->d_parent->d_name.name, - file->f_dentry->d_name.name, - *desc->dir_cookie); - } - res = -ELOOP; - goto out; - } - array = nfs_readdir_get_array(desc->page); if (IS_ERR(array)) { res = PTR_ERR(array); @@ -774,6 +781,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, *desc->dir_cookie = array->array[i+1].cookie; else *desc->dir_cookie = array->last_cookie; + if (ctx->duped != 0) + ctx->duped = 1; } if (array->eof_index >= 0) desc->eof = 1; @@ -805,6 +814,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, struct page *page = NULL; int status; struct inode *inode = desc->file->f_path.dentry->d_inode; + struct nfs_open_dir_context *ctx = desc->file->private_data; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); @@ -818,6 +828,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, desc->page_index = 0; desc->last_cookie = *desc->dir_cookie; desc->page = page; + ctx->duped = 0; status = nfs_readdir_xdr_to_array(desc, page, inode); if (status < 0) @@ -1457,12 +1468,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ + case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; - /* case -EISDIR: */ /* case -EINVAL: */ default: res = ERR_CAST(inode); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b35d25b98da6..1940f1a56a5f 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -53,7 +53,7 @@ #include <asm/system.h> #include <asm/uaccess.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "internal.h" #include "iostat.h" diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 28b8c3f3cda3..eca56d4b39c0 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -40,48 +40,8 @@ #define NFSDBG_FACILITY NFSDBG_FILE -static int nfs_file_open(struct inode *, struct file *); -static int nfs_file_release(struct inode *, struct file *); -static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin); -static int nfs_file_mmap(struct file *, struct vm_area_struct *); -static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos, - struct pipe_inode_info *pipe, - size_t count, unsigned int flags); -static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, - struct file *filp, loff_t *ppos, - size_t count, unsigned int flags); -static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -static int nfs_file_flush(struct file *, fl_owner_t id); -static int nfs_file_fsync(struct file *, loff_t, loff_t, int datasync); -static int nfs_check_flags(int flags); -static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); -static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); -static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); - static const struct vm_operations_struct nfs_file_vm_ops; -const struct file_operations nfs_file_operations = { - .llseek = nfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = nfs_file_read, - .aio_write = nfs_file_write, - .mmap = nfs_file_mmap, - .open = nfs_file_open, - .flush = nfs_file_flush, - .release = nfs_file_release, - .fsync = nfs_file_fsync, - .lock = nfs_lock, - .flock = nfs_flock, - .splice_read = nfs_file_splice_read, - .splice_write = nfs_file_splice_write, - .check_flags = nfs_check_flags, - .setlease = nfs_setlease, -}; - const struct inode_operations nfs_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, @@ -137,11 +97,9 @@ nfs_file_open(struct inode *inode, struct file *filp) static int nfs_file_release(struct inode *inode, struct file *filp) { - struct dentry *dentry = filp->f_path.dentry; - dprintk("NFS: release(%s/%s)\n", - dentry->d_parent->d_name.name, - dentry->d_name.name); + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return nfs_release(inode, filp); @@ -180,8 +138,6 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { - loff_t loff; - dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name, @@ -197,13 +153,9 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) int retval = nfs_revalidate_file_size(inode, filp); if (retval < 0) return (loff_t)retval; + } - spin_lock(&inode->i_lock); - loff = generic_file_llseek_unlocked(filp, offset, origin); - spin_unlock(&inode->i_lock); - } else - loff = generic_file_llseek_unlocked(filp, offset, origin); - return loff; + return generic_file_llseek(filp, offset, origin); } /* @@ -234,14 +186,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, struct dentry * dentry = iocb->ki_filp->f_path.dentry; struct inode * inode = dentry->d_inode; ssize_t result; - size_t count = iov_length(iov, nr_segs); if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_read(iocb, iov, nr_segs, pos); dprintk("NFS: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - (unsigned long) count, (unsigned long) pos); + (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { @@ -895,3 +846,54 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) file->f_path.dentry->d_name.name, arg); return -EINVAL; } + +const struct file_operations nfs_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; + +#ifdef CONFIG_NFS_V4 +static int +nfs4_file_open(struct inode *inode, struct file *filp) +{ + /* + * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to + * this point, then something is very wrong + */ + dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp); + return -ENOTDIR; +} + +const struct file_operations nfs4_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs4_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 5b1006480bc2..7cf2c4699b08 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -212,7 +212,7 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, auxdata.ctime = nfsi->vfs_inode.i_ctime; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->change_attr; + auxdata.change_attr = nfsi->vfs_inode.i_version; if (bufmax > sizeof(auxdata)) bufmax = sizeof(auxdata); @@ -244,7 +244,7 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, auxdata.ctime = nfsi->vfs_inode.i_ctime; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->change_attr; + auxdata.change_attr = nfsi->vfs_inode.i_version; if (memcmp(data, &auxdata, datalen) != 0) return FSCACHE_CHECKAUX_OBSOLETE; diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 79664a1025af..47d1c6ff2d8e 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -36,6 +36,8 @@ #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/nfs_idmap.h> static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) { @@ -59,12 +61,10 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) #ifdef CONFIG_NFS_USE_NEW_IDMAPPER -#include <linux/slab.h> #include <linux/cred.h> #include <linux/sunrpc/sched.h> #include <linux/nfs4.h> #include <linux/nfs_fs_sb.h> -#include <linux/nfs_idmap.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <linux/rcupdate.h> @@ -284,18 +284,15 @@ int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, #include <linux/module.h> #include <linux/mutex.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/sched.h> - #include <linux/sunrpc/clnt.h> #include <linux/workqueue.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include "nfs4_fs.h" #define IDMAP_HASH_SZ 128 @@ -339,8 +336,6 @@ struct idmap { struct idmap_hashtable idmap_group_hash; }; -static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *, - char __user *, size_t); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); @@ -348,7 +343,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); static unsigned int fnvhash32(const void *, size_t); static const struct rpc_pipe_ops idmap_upcall_ops = { - .upcall = idmap_pipe_upcall, + .upcall = rpc_pipe_generic_upcall, .downcall = idmap_pipe_downcall, .destroy_msg = idmap_pipe_destroy_msg, }; @@ -598,27 +593,6 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h, return ret; } -/* RPC pipefs upcall/downcall routines */ -static ssize_t -idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, - char __user *dst, size_t buflen) -{ - char *data = (char *)msg->data + msg->copied; - size_t mlen = min(msg->len, buflen); - unsigned long left; - - left = copy_to_user(dst, data, mlen); - if (left == mlen) { - msg->errno = -EFAULT; - return -EFAULT; - } - - mlen -= left; - msg->copied += mlen; - msg->errno = 0; - return mlen; -} - static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index fe1203797b2b..50a15fa8cf98 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) */ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; if (S_ISREG(inode->i_mode)) { - inode->i_fop = &nfs_file_operations; + inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; } else if (S_ISDIR(inode->i_mode)) { @@ -318,9 +318,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) memset(&inode->i_atime, 0, sizeof(inode->i_atime)); memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); - nfsi->change_attr = 0; + inode->i_version = 0; inode->i_size = 0; - inode->i_nlink = 0; + clear_nlink(inode); inode->i_uid = -2; inode->i_gid = -2; inode->i_blocks = 0; @@ -344,7 +344,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_CHANGE) - nfsi->change_attr = fattr->change_attr; + inode->i_version = fattr->change_attr; else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA; @@ -355,7 +355,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | NFS_INO_INVALID_DATA | NFS_INO_REVAL_PAGECACHE; if (fattr->valid & NFS_ATTR_FATTR_NLINK) - inode->i_nlink = fattr->nlink; + set_nlink(inode, fattr->nlink); else if (nfs_server_capable(inode, NFS_CAP_NLINK)) nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_OWNER) @@ -897,8 +897,8 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) - && nfsi->change_attr == fattr->pre_change_attr) { - nfsi->change_attr = fattr->change_attr; + && inode->i_version == fattr->pre_change_attr) { + inode->i_version = fattr->change_attr; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; ret |= NFS_INO_INVALID_ATTR; @@ -952,7 +952,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return -EIO; if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && - nfsi->change_attr != fattr->change_attr) + inode->i_version != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Verify a few of the more important attributes */ @@ -1163,7 +1163,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { - fattr->pre_change_attr = NFS_I(inode)->change_attr; + fattr->pre_change_attr = inode->i_version; fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && @@ -1244,13 +1244,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { - if (nfsi->change_attr != fattr->change_attr) { + if (inode->i_version != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); - nfsi->change_attr = fattr->change_attr; + inode->i_version = fattr->change_attr; } } else if (server->caps & NFS_CAP_CHANGE_ATTR) invalid |= save_cache_validity; @@ -1361,7 +1361,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR; if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; - inode->i_nlink = fattr->nlink; + set_nlink(inode, fattr->nlink); } } else if (server->caps & NFS_CAP_NLINK) invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 2a55347a2daa..3f4d95751d52 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -277,6 +277,9 @@ extern void nfs_sb_deactive(struct super_block *sb); extern char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen); extern struct vfsmount *nfs_d_automount(struct path *path); +#ifdef CONFIG_NFS_V4 +rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); +#endif /* getroot.c */ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, @@ -288,12 +291,24 @@ extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); #endif +struct nfs_pageio_descriptor; /* read.c */ extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, const struct rpc_call_ops *call_ops); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); +extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, + struct list_head *head); + +extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode); +extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); +extern void nfs_readdata_release(struct nfs_read_data *rdata); /* write.c */ +extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, + struct list_head *head); +extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); +extern void nfs_writedata_release(struct nfs_write_data *wdata); extern void nfs_commit_free(struct nfs_write_data *p); extern int nfs_initiate_write(struct nfs_write_data *data, struct rpc_clnt *clnt, @@ -444,13 +459,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) PAGE_SIZE - 1) >> PAGE_SHIFT; } -/* - * Helper for restarting RPC calls in the possible presence of NFSv4.1 - * sessions. - */ -static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) -{ - if (nfs4_has_session(clp)) - return rpc_restart_call_prepare(task); - return rpc_restart_call(task); -} diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 1f063bacd285..8102391bb374 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -119,7 +119,7 @@ Elong: } #ifdef CONFIG_NFS_V4 -static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) +rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) { struct gss_api_mech *mech; struct xdr_netobj oid; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 274342771655..7ef23979896d 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -415,7 +415,7 @@ fail: } int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, - mode_t mode) + umode_t mode) { struct posix_acl *dfacl, *acl; int error = 0; @@ -427,16 +427,12 @@ int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, } if (!dfacl) return 0; - acl = posix_acl_clone(dfacl, GFP_KERNEL); - error = -ENOMEM; - if (!acl) - goto out_release_dfacl; - error = posix_acl_create_masq(acl, &mode); + acl = posix_acl_dup(dfacl); + error = posix_acl_create(&acl, GFP_KERNEL, &mode); if (error < 0) - goto out_release_acl; + goto out_release_dfacl; error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? dfacl : NULL); -out_release_acl: posix_acl_release(acl); out_release_dfacl: posix_acl_release(dfacl); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 38053d823eb0..d4bc9ed91748 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -316,7 +316,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags, struct nfs_open_context *ctx) { struct nfs3_createdata *data; - mode_t mode = sattr->ia_mode; + umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call create %s\n", dentry->d_name.name); @@ -562,7 +562,7 @@ static int nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct nfs3_createdata *data; - int mode = sattr->ia_mode; + umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mkdir %s\n", dentry->d_name.name); @@ -681,7 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { struct nfs3_createdata *data; - mode_t mode = sattr->ia_mode; + umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, @@ -853,6 +853,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, .file_inode_ops = &nfs3_file_inode_operations, + .file_ops = &nfs_file_operations, .getroot = nfs3_proc_get_root, .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index b788f2eb1ba0..693ae22f8731 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -13,30 +13,6 @@ struct idmap; -/* - * In a seqid-mutating op, this macro controls which error return - * values trigger incrementation of the seqid. - * - * from rfc 3010: - * The client MUST monotonically increment the sequence number for the - * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE - * operations. This is true even in the event that the previous - * operation that used the sequence number received an error. The only - * exception to this rule is if the previous operation received one of - * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID, - * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR, - * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE. - * - */ -#define seqid_mutating_err(err) \ -(((err) != NFSERR_STALE_CLIENTID) && \ - ((err) != NFSERR_STALE_STATEID) && \ - ((err) != NFSERR_BAD_STATEID) && \ - ((err) != NFSERR_BAD_SEQID) && \ - ((err) != NFSERR_BAD_XDR) && \ - ((err) != NFSERR_RESOURCE) && \ - ((err) != NFSERR_NOFILEHANDLE)) - enum nfs4_client_state { NFS4CLNT_MANAGER_RUNNING = 0, NFS4CLNT_CHECK_LEASE, @@ -48,6 +24,7 @@ enum nfs4_client_state { NFS4CLNT_SESSION_RESET, NFS4CLNT_RECALL_SLOT, NFS4CLNT_LEASE_CONFIRM, + NFS4CLNT_SERVER_SCOPE_MISMATCH, }; enum nfs4_session_state { @@ -55,6 +32,9 @@ enum nfs4_session_state { NFS4_SESSION_DRAINING, }; +#define NFS4_RENEW_TIMEOUT 0x01 +#define NFS4_RENEW_DELEGATION_CB 0x02 + struct nfs4_minor_version_ops { u32 minor_version; @@ -66,6 +46,8 @@ struct nfs4_minor_version_ops { int cache_reply); int (*validate_stateid)(struct nfs_delegation *, const nfs4_stateid *); + int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; const struct nfs4_state_maintenance_ops *state_renewal_ops; @@ -222,7 +204,7 @@ struct nfs4_state_recovery_ops { }; struct nfs4_state_maintenance_ops { - int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *); + int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned); struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); int (*renew_lease)(struct nfs_client *, struct rpc_cred *); }; @@ -234,8 +216,6 @@ extern const struct inode_operations nfs4_dir_inode_operations; extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); -extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); -extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); @@ -315,7 +295,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; extern const u32 nfs4_fattr_bitmap[2]; extern const u32 nfs4_statfs_bitmap[2]; extern const u32 nfs4_pathconf_bitmap[2]; -extern const u32 nfs4_fsinfo_bitmap[2]; +extern const u32 nfs4_fsinfo_bitmap[3]; extern const u32 nfs4_fs_locations_bitmap[2]; /* nfs4renewd.c */ @@ -346,9 +326,12 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t); extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); extern void nfs4_schedule_lease_recovery(struct nfs_client *); extern void nfs4_schedule_state_manager(struct nfs_client *); +extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_recall_slot(struct nfs_client *clp); +extern void nfs41_handle_server_scope(struct nfs_client *, + struct server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index f9d03abcd04c..a62d36b9a99e 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -31,6 +31,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> +#include <linux/module.h> #include "internal.h" #include "nfs4filelayout.h" @@ -77,19 +78,6 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } -/* For data server errors we don't recover from */ -static void -filelayout_set_lo_fail(struct pnfs_layout_segment *lseg) -{ - if (lseg->pls_range.iomode == IOMODE_RW) { - dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - } else { - dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); - } -} - static int filelayout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, @@ -135,7 +123,6 @@ static int filelayout_async_handle_error(struct rpc_task *task, static int filelayout_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { - struct nfs_client *clp = data->ds_clp; int reset = 0; dprintk("%s DS read\n", __func__); @@ -145,11 +132,10 @@ static int filelayout_read_done_cb(struct rpc_task *task, dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { - filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); nfs4_reset_read(task, data); - clp = NFS_SERVER(data->inode)->nfs_client; } - nfs_restart_rpc(task, clp); + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -170,7 +156,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata) pnfs_set_layoutcommit(wdata); dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, - (unsigned long) wdata->lseg->pls_end_pos); + (unsigned long) NFS_I(wdata->inode)->layout->plh_lwb); } /* @@ -216,17 +202,13 @@ static int filelayout_write_done_cb(struct rpc_task *task, if (filelayout_async_handle_error(task, data->args.context->state, data->ds_clp, &reset) == -EAGAIN) { - struct nfs_client *clp; - dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { - filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); nfs4_reset_write(task, data); - clp = NFS_SERVER(data->inode)->nfs_client; - } else - clp = data->ds_clp; - nfs_restart_rpc(task, clp); + } + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -256,9 +238,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task, __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { prepare_to_resend_writes(data); - filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); } else - nfs_restart_rpc(task, data->ds_clp); + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -334,6 +316,9 @@ filelayout_read_pagelist(struct nfs_read_data *data) __func__, data->inode->i_ino, data->args.pgbase, (size_t)data->args.count, offset); + if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags)) + return PNFS_NOT_ATTEMPTED; + /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); idx = nfs4_fl_calc_ds_index(lseg, j); @@ -344,8 +329,7 @@ filelayout_read_pagelist(struct nfs_read_data *data) set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); return PNFS_NOT_ATTEMPTED; } - dprintk("%s USE DS:ip %x %hu\n", __func__, - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + dprintk("%s USE DS: %s\n", __func__, ds->ds_remotestr); /* No multipath support. Use first DS */ data->ds_clp = ds->ds_clp; @@ -374,6 +358,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) struct nfs_fh *fh; int status; + if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags)) + return PNFS_NOT_ATTEMPTED; + /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); idx = nfs4_fl_calc_ds_index(lseg, j); @@ -384,9 +371,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); return PNFS_NOT_ATTEMPTED; } - dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__, + dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s\n", __func__, data->inode->i_ino, sync, (size_t) data->args.count, offset, - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + ds->ds_remotestr); data->write_done_cb = filelayout_write_done_cb; data->ds_clp = ds->ds_clp; @@ -428,6 +415,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); + /* FIXME: remove this check when layout segment support is added */ + if (lgr->range.offset != 0 || + lgr->range.length != NFS4_MAX_UINT64) { + dprintk("%s Only whole file layouts supported. Use MDS i/o\n", + __func__); + goto out; + } + if (fl->pattern_offset > lgr->range.offset) { dprintk("%s pattern_offset %lld too large\n", __func__, fl->pattern_offset); @@ -449,11 +444,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out; } else dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + /* Found deviceid is being reaped */ + if (test_bit(NFS_DEVICEID_INVALID, &dsaddr->id_node.flags)) + goto out_put; + fl->dsaddr = dsaddr; - if (fl->first_stripe_index < 0 || - fl->first_stripe_index >= dsaddr->stripe_count) { - dprintk("%s Bad first_stripe_index %d\n", + if (fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %u\n", __func__, fl->first_stripe_index); goto out_put; } @@ -554,7 +552,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. * Futher checking is done in filelayout_check_layout */ - if (fl->num_fh < 0 || fl->num_fh > + if (fl->num_fh > max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) goto out_err; @@ -659,7 +657,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, * return true : coalesce page * return false : don't coalesce page */ -bool +static bool filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { @@ -670,8 +668,6 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, !nfs_generic_pg_test(pgio, prev, req)) return false; - if (!pgio->pg_lseg) - return 1; p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; @@ -682,6 +678,52 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, return (p_stripe == r_stripe); } +void +filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + BUG_ON(pgio->pg_lseg != NULL); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_READ, + GFP_KERNEL); + /* If no lseg, fall back to read through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_read_mds(pgio); +} + +void +filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + BUG_ON(pgio->pg_lseg != NULL); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_RW, + GFP_NOFS); + /* If no lseg, fall back to write through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_write_mds(pgio); +} + +static const struct nfs_pageio_ops filelayout_pg_read_ops = { + .pg_init = filelayout_pg_init_read, + .pg_test = filelayout_pg_test, + .pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops filelayout_pg_write_ops = { + .pg_init = filelayout_pg_init_write, + .pg_test = filelayout_pg_test, + .pg_doio = pnfs_generic_pg_writepages, +}; + static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg) { return !FILELAYOUT_LSEG(lseg)->commit_through_mds; @@ -879,7 +921,8 @@ static struct pnfs_layoutdriver_type filelayout_type = { .owner = THIS_MODULE, .alloc_lseg = filelayout_alloc_lseg, .free_lseg = filelayout_free_lseg, - .pg_test = filelayout_pg_test, + .pg_read_ops = &filelayout_pg_read_ops, + .pg_write_ops = &filelayout_pg_write_ops, .mark_pnfs_commit = filelayout_mark_pnfs_commit, .choose_commit_list = filelayout_choose_commit_list, .commit_pagelist = filelayout_commit_pagelist, @@ -902,5 +945,7 @@ static void __exit nfs4filelayout_exit(void) pnfs_unregister_layoutdriver(&filelayout_type); } +MODULE_ALIAS("nfs-layouttype4-1"); + module_init(nfs4filelayout_init); module_exit(nfs4filelayout_exit); diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index cebe01e3795e..2e42284253fa 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -47,10 +47,17 @@ enum stripetype4 { }; /* Individual ip address */ +struct nfs4_pnfs_ds_addr { + struct sockaddr_storage da_addr; + size_t da_addrlen; + struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + char *da_remotestr; /* human readable addr+port */ +}; + struct nfs4_pnfs_ds { struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ - u32 ds_ip_addr; - u32 ds_port; + char *ds_remotestr; /* comma sep list of addrs */ + struct list_head ds_addrs; struct nfs_client *ds_clp; atomic_t ds_count; }; @@ -89,6 +96,12 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) generic_hdr); } +static inline struct nfs4_deviceid_node * +FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg) +{ + return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; +} + extern struct nfs_fh * nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 3b7bf1377264..ed388aae9689 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -56,54 +56,139 @@ print_ds(struct nfs4_pnfs_ds *ds) printk("%s NULL device\n", __func__); return; } - printk(" ip_addr %x port %hu\n" + printk(" ds %s\n" " ref count %d\n" " client %p\n" " cl_exchange_flags %x\n", - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + ds->ds_remotestr, atomic_read(&ds->ds_count), ds->ds_clp, ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); } -/* nfs4_ds_cache_lock is held */ -static struct nfs4_pnfs_ds * -_data_server_lookup_locked(u32 ip_addr, u32 port) +static bool +same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) { - struct nfs4_pnfs_ds *ds; + struct sockaddr_in *a, *b; + struct sockaddr_in6 *a6, *b6; + + if (addr1->sa_family != addr2->sa_family) + return false; + + switch (addr1->sa_family) { + case AF_INET: + a = (struct sockaddr_in *)addr1; + b = (struct sockaddr_in *)addr2; + + if (a->sin_addr.s_addr == b->sin_addr.s_addr && + a->sin_port == b->sin_port) + return true; + break; + + case AF_INET6: + a6 = (struct sockaddr_in6 *)addr1; + b6 = (struct sockaddr_in6 *)addr2; + + /* LINKLOCAL addresses must have matching scope_id */ + if (ipv6_addr_scope(&a6->sin6_addr) == + IPV6_ADDR_SCOPE_LINKLOCAL && + a6->sin6_scope_id != b6->sin6_scope_id) + return false; + + if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) && + a6->sin6_port == b6->sin6_port) + return true; + break; + + default: + dprintk("%s: unhandled address family: %u\n", + __func__, addr1->sa_family); + return false; + } - dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", - ntohl(ip_addr), ntohs(port)); + return false; +} - list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { - if (ds->ds_ip_addr == ip_addr && - ds->ds_port == port) { - return ds; +/* + * Lookup DS by addresses. The first matching address returns true. + * nfs4_ds_cache_lock is held + */ +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(struct list_head *dsaddrs) +{ + struct nfs4_pnfs_ds *ds; + struct nfs4_pnfs_ds_addr *da1, *da2; + + list_for_each_entry(da1, dsaddrs, da_node) { + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { + list_for_each_entry(da2, &ds->ds_addrs, da_node) { + if (same_sockaddr( + (struct sockaddr *)&da1->da_addr, + (struct sockaddr *)&da2->da_addr)) + return ds; + } } } return NULL; } /* + * Compare two lists of addresses. + */ +static bool +_data_server_match_all_addrs_locked(struct list_head *dsaddrs1, + struct list_head *dsaddrs2) +{ + struct nfs4_pnfs_ds_addr *da1, *da2; + size_t count1 = 0, + count2 = 0; + + list_for_each_entry(da1, dsaddrs1, da_node) + count1++; + + list_for_each_entry(da2, dsaddrs2, da_node) { + bool found = false; + count2++; + list_for_each_entry(da1, dsaddrs1, da_node) { + if (same_sockaddr((struct sockaddr *)&da1->da_addr, + (struct sockaddr *)&da2->da_addr)) { + found = true; + break; + } + } + if (!found) + return false; + } + + return (count1 == count2); +} + +/* * Create an rpc connection to the nfs4_pnfs_ds data server - * Currently only support IPv4 + * Currently only supports IPv4 and IPv6 addresses */ static int nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) { - struct nfs_client *clp; - struct sockaddr_in sin; + struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs4_pnfs_ds_addr *da; int status = 0; - dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__, - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = ds->ds_ip_addr; - sin.sin_port = ds->ds_port; + BUG_ON(list_empty(&ds->ds_addrs)); + + list_for_each_entry(da, &ds->ds_addrs, da_node) { + dprintk("%s: DS %s: trying address %s\n", + __func__, ds->ds_remotestr, da->da_remotestr); + + clp = nfs4_set_ds_client(mds_srv->nfs_client, + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP); + if (!IS_ERR(clp)) + break; + } - clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin, - sizeof(sin), IPPROTO_TCP); if (IS_ERR(clp)) { status = PTR_ERR(clp); goto out; @@ -115,8 +200,8 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) goto out_put; } ds->ds_clp = clp; - dprintk("%s [existing] ip=%x, port=%hu\n", __func__, - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + dprintk("%s [existing] server=%s\n", __func__, + ds->ds_remotestr); goto out; } @@ -135,8 +220,7 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) goto out_put; ds->ds_clp = clp; - dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr), - ntohs(ds->ds_port)); + dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); out: return status; out_put: @@ -147,12 +231,25 @@ out_put: static void destroy_ds(struct nfs4_pnfs_ds *ds) { + struct nfs4_pnfs_ds_addr *da; + dprintk("--> %s\n", __func__); ifdebug(FACILITY) print_ds(ds); if (ds->ds_clp) nfs_put_client(ds->ds_clp); + + while (!list_empty(&ds->ds_addrs)) { + da = list_first_entry(&ds->ds_addrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + + kfree(ds->ds_remotestr); kfree(ds); } @@ -179,31 +276,96 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) kfree(dsaddr); } +/* + * Create a string with a human readable address and port to avoid + * complicated setup around many dprinks. + */ +static char * +nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds_addr *da; + char *remotestr; + size_t len; + char *p; + + len = 3; /* '{', '}' and eol */ + list_for_each_entry(da, dsaddrs, da_node) { + len += strlen(da->da_remotestr) + 1; /* string plus comma */ + } + + remotestr = kzalloc(len, gfp_flags); + if (!remotestr) + return NULL; + + p = remotestr; + *(p++) = '{'; + len--; + list_for_each_entry(da, dsaddrs, da_node) { + size_t ll = strlen(da->da_remotestr); + + if (ll > len) + goto out_err; + + memcpy(p, da->da_remotestr, ll); + p += ll; + len -= ll; + + if (len < 1) + goto out_err; + (*p++) = ','; + len--; + } + if (len < 2) + goto out_err; + *(p++) = '}'; + *p = '\0'; + return remotestr; +out_err: + kfree(remotestr); + return NULL; +} + static struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port, gfp_t gfp_flags) +nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) { - struct nfs4_pnfs_ds *tmp_ds, *ds; + struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; + char *remotestr; - ds = kzalloc(sizeof(*tmp_ds), gfp_flags); + if (list_empty(dsaddrs)) { + dprintk("%s: no addresses defined\n", __func__); + goto out; + } + + ds = kzalloc(sizeof(*ds), gfp_flags); if (!ds) goto out; + /* this is only used for debugging, so it's ok if its NULL */ + remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); + spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(ip_addr, port); + tmp_ds = _data_server_lookup_locked(dsaddrs); if (tmp_ds == NULL) { - ds->ds_ip_addr = ip_addr; - ds->ds_port = port; + INIT_LIST_HEAD(&ds->ds_addrs); + list_splice_init(dsaddrs, &ds->ds_addrs); + ds->ds_remotestr = remotestr; atomic_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); ds->ds_clp = NULL; list_add(&ds->ds_node, &nfs4_data_server_cache); - dprintk("%s add new data server ip 0x%x\n", __func__, - ds->ds_ip_addr); + dprintk("%s add new data server %s\n", __func__, + ds->ds_remotestr); } else { + if (!_data_server_match_all_addrs_locked(&tmp_ds->ds_addrs, + dsaddrs)) { + dprintk("%s: multipath address mismatch: %s != %s", + __func__, tmp_ds->ds_remotestr, remotestr); + } + kfree(remotestr); kfree(ds); atomic_inc(&tmp_ds->ds_count); - dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", - __func__, tmp_ds->ds_ip_addr, + dprintk("%s data server %s found, inc'ed ds_count to %d\n", + __func__, tmp_ds->ds_remotestr, atomic_read(&tmp_ds->ds_count)); ds = tmp_ds; } @@ -213,18 +375,22 @@ out: } /* - * Currently only support ipv4, and one multi-path address. + * Currently only supports ipv4, ipv6 and one multi-path address. */ -static struct nfs4_pnfs_ds * -decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_flags) +static struct nfs4_pnfs_ds_addr * +decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags) { - struct nfs4_pnfs_ds *ds = NULL; - char *buf; - const char *ipend, *pstr; - u32 ip_addr, port; - int nlen, rlen, i; + struct nfs4_pnfs_ds_addr *da = NULL; + char *buf, *portstr; + u32 port; + int nlen, rlen; int tmp[2]; __be32 *p; + char *netid, *match_netid; + size_t len, match_netid_len; + char *startsep = ""; + char *endsep = ""; + /* r_netid */ p = xdr_inline_decode(streamp, 4); @@ -236,64 +402,123 @@ decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_fla if (unlikely(!p)) goto out_err; - /* Check that netid is "tcp" */ - if (nlen != 3 || memcmp((char *)p, "tcp", 3)) { - dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); + netid = kmalloc(nlen+1, gfp_flags); + if (unlikely(!netid)) goto out_err; - } - /* r_addr */ + netid[nlen] = '\0'; + memcpy(netid, p, nlen); + + /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */ p = xdr_inline_decode(streamp, 4); if (unlikely(!p)) - goto out_err; + goto out_free_netid; rlen = be32_to_cpup(p); p = xdr_inline_decode(streamp, rlen); if (unlikely(!p)) - goto out_err; + goto out_free_netid; - /* ipv6 length plus port is legal */ - if (rlen > INET6_ADDRSTRLEN + 8) { + /* port is ".ABC.DEF", 8 chars max */ + if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) { dprintk("%s: Invalid address, length %d\n", __func__, rlen); - goto out_err; + goto out_free_netid; } buf = kmalloc(rlen + 1, gfp_flags); if (!buf) { dprintk("%s: Not enough memory\n", __func__); - goto out_err; + goto out_free_netid; } buf[rlen] = '\0'; memcpy(buf, p, rlen); - /* replace the port dots with dashes for the in4_pton() delimiter*/ - for (i = 0; i < 2; i++) { - char *res = strrchr(buf, '.'); - if (!res) { - dprintk("%s: Failed finding expected dots in port\n", - __func__); - goto out_free; - } - *res = '-'; + /* replace port '.' with '-' */ + portstr = strrchr(buf, '.'); + if (!portstr) { + dprintk("%s: Failed finding expected dot in port\n", + __func__); + goto out_free_buf; + } + *portstr = '-'; + + /* find '.' between address and port */ + portstr = strrchr(buf, '.'); + if (!portstr) { + dprintk("%s: Failed finding expected dot between address and " + "port\n", __func__); + goto out_free_buf; } + *portstr = '\0'; - /* Currently only support ipv4 address */ - if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { - dprintk("%s: Only ipv4 addresses supported\n", __func__); - goto out_free; + da = kzalloc(sizeof(*da), gfp_flags); + if (unlikely(!da)) + goto out_free_buf; + + INIT_LIST_HEAD(&da->da_node); + + if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&da->da_addr, + sizeof(da->da_addr))) { + dprintk("%s: error parsing address %s\n", __func__, buf); + goto out_free_da; } - /* port */ - pstr = ipend; - sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]); + portstr++; + sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]); port = htons((tmp[0] << 8) | (tmp[1])); - ds = nfs4_pnfs_ds_add(inode, ip_addr, port, gfp_flags); - dprintk("%s: Decoded address and port %s\n", __func__, buf); -out_free: + switch (da->da_addr.ss_family) { + case AF_INET: + ((struct sockaddr_in *)&da->da_addr)->sin_port = port; + da->da_addrlen = sizeof(struct sockaddr_in); + match_netid = "tcp"; + match_netid_len = 3; + break; + + case AF_INET6: + ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port; + da->da_addrlen = sizeof(struct sockaddr_in6); + match_netid = "tcp6"; + match_netid_len = 4; + startsep = "["; + endsep = "]"; + break; + + default: + dprintk("%s: unsupported address family: %u\n", + __func__, da->da_addr.ss_family); + goto out_free_da; + } + + if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { + dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", + __func__, netid, match_netid); + goto out_free_da; + } + + /* save human readable address */ + len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7; + da->da_remotestr = kzalloc(len, gfp_flags); + + /* NULL is ok, only used for dprintk */ + if (da->da_remotestr) + snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep, + buf, endsep, ntohs(port)); + + dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr); kfree(buf); + kfree(netid); + return da; + +out_free_da: + kfree(da); +out_free_buf: + dprintk("%s: Error parsing DS addr: %s\n", __func__, buf); + kfree(buf); +out_free_netid: + kfree(netid); out_err: - return ds; + return NULL; } /* Decode opaque device data and return the result */ @@ -310,6 +535,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) struct xdr_stream stream; struct xdr_buf buf; struct page *scratch; + struct list_head dsaddrs; + struct nfs4_pnfs_ds_addr *da; /* set up xdr stream */ scratch = alloc_page(gfp_flags); @@ -386,6 +613,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) NFS_SERVER(ino)->nfs_client, &pdev->dev_id); + INIT_LIST_HEAD(&dsaddrs); + for (i = 0; i < dsaddr->ds_num; i++) { int j; u32 mp_count; @@ -395,48 +624,43 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) goto out_err_free_deviceid; mp_count = be32_to_cpup(p); /* multipath count */ - if (mp_count > 1) { - printk(KERN_WARNING - "%s: Multipath count %d not supported, " - "skipping all greater than 1\n", __func__, - mp_count); - } for (j = 0; j < mp_count; j++) { - if (j == 0) { - dsaddr->ds_list[i] = decode_and_add_ds(&stream, - ino, gfp_flags); - if (dsaddr->ds_list[i] == NULL) - goto out_err_free_deviceid; - } else { - u32 len; - /* skip extra multipath */ - - /* read len, skip */ - p = xdr_inline_decode(&stream, 4); - if (unlikely(!p)) - goto out_err_free_deviceid; - len = be32_to_cpup(p); - - p = xdr_inline_decode(&stream, len); - if (unlikely(!p)) - goto out_err_free_deviceid; - - /* read len, skip */ - p = xdr_inline_decode(&stream, 4); - if (unlikely(!p)) - goto out_err_free_deviceid; - len = be32_to_cpup(p); - - p = xdr_inline_decode(&stream, len); - if (unlikely(!p)) - goto out_err_free_deviceid; - } + da = decode_ds_addr(&stream, gfp_flags); + if (da) + list_add_tail(&da->da_node, &dsaddrs); + } + if (list_empty(&dsaddrs)) { + dprintk("%s: no suitable DS addresses found\n", + __func__); + goto out_err_free_deviceid; + } + + dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + if (!dsaddr->ds_list[i]) + goto out_err_drain_dsaddrs; + + /* If DS was already in cache, free ds addrs */ + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); } } __free_page(scratch); return dsaddr; +out_err_drain_dsaddrs: + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } out_err_free_deviceid: nfs4_fl_free_deviceid(dsaddr); /* stripe_indicies was part of dsaddr */ @@ -591,13 +815,13 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) static void filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, - int err, u32 ds_addr) + int err, const char *ds_remotestr) { u32 *p = (u32 *)&dsaddr->id_node.deviceid; - printk(KERN_ERR "NFS: data server %x connection error %d." + printk(KERN_ERR "NFS: data server %s connection error %d." " Deviceid [%x%x%x%x] marked out of use.\n", - ds_addr, err, p[0], p[1], p[2], p[3]); + ds_remotestr, err, p[0], p[1], p[2], p[3]); spin_lock(&nfs4_ds_cache_lock); dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; @@ -628,7 +852,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) err = nfs4_ds_connect(s, ds); if (err) { filelayout_mark_devid_negative(dsaddr, err, - ntohl(ds->ds_ip_addr)); + ds->ds_remotestr); return NULL; } } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 26bece8f3083..be2bbac13817 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -73,14 +73,14 @@ static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); -static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, struct nfs4_state *state); - +#ifdef CONFIG_NFS_V4_1 +static int nfs41_test_stateid(struct nfs_server *, struct nfs4_state *); +static int nfs41_free_stateid(struct nfs_server *, struct nfs4_state *); +#endif /* Prevent leaks of NFSv4 errors into userland */ static int nfs4_map_errors(int err) { @@ -137,12 +137,13 @@ const u32 nfs4_pathconf_bitmap[2] = { 0 }; -const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE +const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_LEASE_TIME, FATTR4_WORD1_TIME_DELTA - | FATTR4_WORD1_FS_LAYOUT_TYPES + | FATTR4_WORD1_FS_LAYOUT_TYPES, + FATTR4_WORD2_LAYOUT_BLKSIZE }; const u32 nfs4_fs_locations_bitmap[2] = { @@ -749,9 +750,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) spin_lock(&dir->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; - if (!cinfo->atomic || cinfo->before != nfsi->change_attr) + if (!cinfo->atomic || cinfo->before != dir->i_version) nfs_force_lookup_revalidate(dir); - nfsi->change_attr = cinfo->after; + dir->i_version = cinfo->after; spin_unlock(&dir->i_lock); } @@ -1592,8 +1593,14 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) int status; status = nfs4_run_open_task(data, 0); - if (status != 0 || !data->rpc_done) + if (!data->rpc_done) + return status; + if (status != 0) { + if (status == -NFS4ERR_BADNAME && + !(o_arg->open_flags & O_CREAT)) + return -ENOENT; return status; + } if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); @@ -1689,6 +1696,20 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } +#if defined(CONFIG_NFS_V4_1) +static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + int status; + struct nfs_server *server = NFS_SERVER(state->inode); + + status = nfs41_test_stateid(server, state); + if (status == NFS_OK) + return 0; + nfs41_free_stateid(server, state); + return nfs4_open_expired(sp, state); +} +#endif + /* * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-* * fields corresponding to attributes that were used to store the verifier. @@ -2252,13 +2273,14 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { + int minor_version = server->nfs_client->cl_minorversion; int status = nfs4_lookup_root(server, fhandle, info); if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR)) /* * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM * by nfs4_map_errors() as this function exits. */ - status = nfs4_find_root_sec(server, fhandle, info); + status = nfs_v4_minor_ops[minor_version]->find_root_sec(server, fhandle, info); if (status == 0) status = nfs4_server_capabilities(server, fhandle); if (status == 0) @@ -2389,14 +2411,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, return status; } -static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, - const struct nfs_fh *dirfh, const struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) { + struct nfs_server *server = NFS_SERVER(dir); int status; struct nfs4_lookup_arg args = { .bitmask = server->attr_bitmask, - .dir_fh = dirfh, + .dir_fh = NFS_FH(dir), .name = name, }; struct nfs4_lookup_res res = { @@ -2412,40 +2435,8 @@ static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, nfs_fattr_init(fattr); - dprintk("NFS call lookupfh %s\n", name->name); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); - dprintk("NFS reply lookupfh: %d\n", status); - return status; -} - -static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, - struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) -{ - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr); - /* FIXME: !!!! */ - if (err == -NFS4ERR_MOVED) { - err = -EREMOTE; - break; - } - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; -} - -static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) -{ - int status; - dprintk("NFS call lookup %s\n", name->name); - status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); - if (status == -NFS4ERR_MOVED) - status = nfs4_get_referral(dir, name, fattr, fhandle); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); dprintk("NFS reply lookup: %d\n", status); return status; } @@ -2466,11 +2457,19 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr), - &exception); - if (err == -EPERM) + int status; + + status = _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr); + switch (status) { + case -NFS4ERR_BADNAME: + return -ENOENT; + case -NFS4ERR_MOVED: + return nfs4_get_referral(dir, name, fattr, fhandle); + case -NFS4ERR_WRONGSEC: nfs_fixup_secinfo_attributes(fattr, fhandle); + } + err = nfs4_handle_exception(NFS_SERVER(dir), + status, &exception); } while (exception.retry); return err; } @@ -3191,7 +3190,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) struct nfs_server *server = NFS_SERVER(data->inode); if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -3241,7 +3240,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { - nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } if (task->tk_status >= 0) { @@ -3298,7 +3297,7 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *dat struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { - nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } nfs_refresh_inode(inode, data->res.fattr); @@ -3355,9 +3354,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata) if (task->tk_status < 0) { /* Unless we're shutting down, schedule state recovery! */ - if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) + if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0) + return; + if (task->tk_status != NFS4ERR_CB_PATH_DOWN) { nfs4_schedule_lease_recovery(clp); - return; + return; + } + nfs4_schedule_path_down_recovery(clp); } do_renew_lease(clp, timestamp); } @@ -3367,7 +3370,7 @@ static const struct rpc_call_ops nfs4_renew_ops = { .rpc_release = nfs4_renew_release, }; -int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) +static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], @@ -3376,9 +3379,11 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) }; struct nfs4_renewdata *data; + if (renew_flags == 0) + return 0; if (!atomic_inc_not_zero(&clp->cl_count)) return -EIO; - data = kmalloc(sizeof(*data), GFP_KERNEL); + data = kmalloc(sizeof(*data), GFP_NOFS); if (data == NULL) return -ENOMEM; data->client = clp; @@ -3387,7 +3392,7 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) &nfs4_renew_ops, data); } -int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) +static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], @@ -3832,7 +3837,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) default: if (nfs4_async_handle_error(task, data->res.server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, data->res.server->nfs_client); + rpc_restart_call_prepare(task); return; } } @@ -4086,8 +4091,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) break; default: if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) - nfs_restart_rpc(task, - calldata->server->nfs_client); + rpc_restart_call_prepare(task); } } @@ -4441,6 +4445,20 @@ out: return err; } +#if defined(CONFIG_NFS_V4_1) +static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) +{ + int status; + struct nfs_server *server = NFS_SERVER(state->inode); + + status = nfs41_test_stateid(server, state); + if (status == NFS_OK) + return 0; + nfs41_free_stateid(server, state); + return nfs4_lock_expired(state, request); +} +#endif + static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct nfs_inode *nfsi = NFS_I(state->inode); @@ -4779,6 +4797,16 @@ out_inval: return -NFS4ERR_INVAL; } +static bool +nfs41_same_server_scope(struct server_scope *a, struct server_scope *b) +{ + if (a->server_scope_sz == b->server_scope_sz && + memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0) + return true; + + return false; +} + /* * nfs4_proc_exchange_id() * @@ -4821,9 +4849,31 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) init_utsname()->domainname, clp->cl_rpcclient->cl_auth->au_flavor); + res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL); + if (unlikely(!res.server_scope)) + return -ENOMEM; + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (!status) status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); + + if (!status) { + if (clp->server_scope && + !nfs41_same_server_scope(clp->server_scope, + res.server_scope)) { + dprintk("%s: server_scope mismatch detected\n", + __func__); + set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); + kfree(clp->server_scope); + clp->server_scope = NULL; + } + + if (!clp->server_scope) + clp->server_scope = res.server_scope; + else + kfree(res.server_scope); + } + dprintk("<-- %s status= %d\n", __func__, status); return status; } @@ -4874,7 +4924,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) task->tk_status = 0; /* fall through */ case -NFS4ERR_RETRY_UNCACHED_REP: - nfs_restart_rpc(task, data->clp); + rpc_restart_call_prepare(task); return; } dprintk("<-- %s\n", __func__); @@ -5439,11 +5489,13 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_ return rpc_run_task(&task_setup_data); } -static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) +static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags) { struct rpc_task *task; int ret = 0; + if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) + return 0; task = _nfs41_proc_sequence(clp, cred); if (IS_ERR(task)) ret = PTR_ERR(task); @@ -5704,7 +5756,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct nfs_server *server; - struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; + struct pnfs_layout_hdr *lo = lrp->args.layout; dprintk("--> %s\n", __func__); @@ -5713,7 +5765,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) server = NFS_SERVER(lrp->args.inode); if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, lrp->clp); + rpc_restart_call_prepare(task); return; } spin_lock(&lo->plh_inode->i_lock); @@ -5733,7 +5785,7 @@ static void nfs4_layoutreturn_release(void *calldata) struct nfs4_layoutreturn *lrp = calldata; dprintk("--> %s\n", __func__); - put_layout_hdr(NFS_I(lrp->args.inode)->layout); + put_layout_hdr(lrp->args.layout); kfree(calldata); dprintk("<-- %s\n", __func__); } @@ -5770,6 +5822,54 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) return status; } +/* + * Retrieve the list of Data Server devices from the MDS. + */ +static int _nfs4_getdevicelist(struct nfs_server *server, + const struct nfs_fh *fh, + struct pnfs_devicelist *devlist) +{ + struct nfs4_getdevicelist_args args = { + .fh = fh, + .layoutclass = server->pnfs_curr_ld->id, + }; + struct nfs4_getdevicelist_res res = { + .devlist = devlist, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, + &res.seq_res, 0); + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} + +int nfs4_proc_getdevicelist(struct nfs_server *server, + const struct nfs_fh *fh, + struct pnfs_devicelist *devlist) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_getdevicelist(server, fh, devlist), + &exception); + } while (exception.retry); + + dprintk("%s: err=%d, num_devs=%u\n", __func__, + err, devlist->num_devs); + + return err; +} +EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); + static int _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) { @@ -5836,7 +5936,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) } if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return; } @@ -5848,9 +5948,22 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) static void nfs4_layoutcommit_release(void *calldata) { struct nfs4_layoutcommit_data *data = calldata; + struct pnfs_layout_segment *lseg, *tmp; + unsigned long *bitlock = &NFS_I(data->args.inode)->flags; + pnfs_cleanup_layoutcommit(data); /* Matched by references in pnfs_set_layoutcommit */ - put_lseg(data->lseg); + list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { + list_del_init(&lseg->pls_lc_list); + if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, + &lseg->pls_flags)) + put_lseg(lseg); + } + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); + put_rpccred(data->cred); kfree(data); } @@ -5901,6 +6014,143 @@ out: rpc_put_task(task); return status; } + +static int +_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +{ + struct nfs41_secinfo_no_name_args args = { + .style = SECINFO_STYLE_CURRENT_FH, + }; + struct nfs4_secinfo_res res = { + .flavors = flavors, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO_NO_NAME], + .rpc_argp = &args, + .rpc_resp = &res, + }; + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int +nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + switch (err) { + case 0: + case -NFS4ERR_WRONGSEC: + case -NFS4ERR_NOTSUPP: + break; + default: + err = nfs4_handle_exception(server, err, &exception); + } + } while (exception.retry); + return err; +} + +static int +nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int err; + struct page *page; + rpc_authflavor_t flavor; + struct nfs4_secinfo_flavors *flavors; + + page = alloc_page(GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + flavors = page_address(page); + err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + + /* + * Fall back on "guess and check" method if + * the server doesn't support SECINFO_NO_NAME + */ + if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) { + err = nfs4_find_root_sec(server, fhandle, info); + goto out_freepage; + } + if (err) + goto out_freepage; + + flavor = nfs_find_best_sec(flavors); + if (err == 0) + err = nfs4_lookup_root_sec(server, fhandle, info, flavor); + +out_freepage: + put_page(page); + if (err == -EACCES) + return -EPERM; +out: + return err; +} +static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state) +{ + int status; + struct nfs41_test_stateid_args args = { + .stateid = &state->stateid, + }; + struct nfs41_test_stateid_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID], + .rpc_argp = &args, + .rpc_resp = &res, + }; + args.seq_args.sa_session = res.seq_res.sr_session = NULL; + status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1); + return status; +} + +static int nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs41_test_stateid(server, state), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *state) +{ + int status; + struct nfs41_free_stateid_args args = { + .stateid = &state->stateid, + }; + struct nfs41_free_stateid_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + args.seq_args.sa_session = res.seq_res.sr_session = NULL; + status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1); + return status; +} + +static int nfs41_free_stateid(struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_free_stateid(server, state), + &exception); + } while (exception.retry); + return err; +} #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { @@ -5937,8 +6187,8 @@ struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, - .recover_open = nfs4_open_expired, - .recover_lock = nfs4_lock_expired, + .recover_open = nfs41_open_expired, + .recover_lock = nfs41_lock_expired, .establish_clid = nfs41_init_clientid, .get_clid_cred = nfs4_get_exchange_id_cred, }; @@ -5962,6 +6212,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, .call_sync = _nfs4_call_sync, .validate_stateid = nfs4_validate_delegation_stateid, + .find_root_sec = nfs4_find_root_sec, .reboot_recovery_ops = &nfs40_reboot_recovery_ops, .nograce_recovery_ops = &nfs40_nograce_recovery_ops, .state_renewal_ops = &nfs40_state_renewal_ops, @@ -5972,6 +6223,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, .call_sync = _nfs4_call_sync_session, .validate_stateid = nfs41_validate_delegation_stateid, + .find_root_sec = nfs41_find_root_sec, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, @@ -6000,10 +6252,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .dentry_ops = &nfs4_dentry_operations, .dir_inode_ops = &nfs4_dir_inode_operations, .file_inode_ops = &nfs4_file_inode_operations, + .file_ops = &nfs4_file_operations, .getroot = nfs4_proc_get_root, .getattr = nfs4_proc_getattr, .setattr = nfs4_proc_setattr, - .lookupfh = nfs4_proc_lookupfh, .lookup = nfs4_proc_lookup, .access = nfs4_proc_access, .readlink = nfs4_proc_readlink, diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index df8e7f3ca56d..dc484c0eae7f 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -60,6 +60,7 @@ nfs4_renew_state(struct work_struct *work) struct rpc_cred *cred; long lease; unsigned long last, now; + unsigned renew_flags = 0; ops = clp->cl_mvops->state_renewal_ops; dprintk("%s: start\n", __func__); @@ -72,18 +73,23 @@ nfs4_renew_state(struct work_struct *work) last = clp->cl_last_renewal; now = jiffies; /* Are we close to a lease timeout? */ - if (time_after(now, last + lease/3)) { + if (time_after(now, last + lease/3)) + renew_flags |= NFS4_RENEW_TIMEOUT; + if (nfs_delegations_present(clp)) + renew_flags |= NFS4_RENEW_DELEGATION_CB; + + if (renew_flags != 0) { cred = ops->get_state_renewal_cred_locked(clp); spin_unlock(&clp->cl_lock); if (cred == NULL) { - if (!nfs_delegations_present(clp)) { + if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) { set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); goto out; } nfs_expire_all_delegations(clp); } else { /* Queue an asynchronous RENEW. */ - ops->sched_state_renewal(clp, cred); + ops->sched_state_renewal(clp, cred, renew_flags); put_rpccred(cred); goto out_exp; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 7acfe8843626..39914be40b03 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1038,6 +1038,12 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } +void nfs4_schedule_path_down_recovery(struct nfs_client *clp) +{ + nfs_handle_cb_pathdown(clp); + nfs4_schedule_state_manager(clp); +} + static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) { @@ -1643,7 +1649,14 @@ static void nfs4_state_manager(struct nfs_client *clp) goto out_error; } clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); - set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + + if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, + &clp->cl_state)) + nfs4_state_start_reclaim_nograce(clp); + else + set_bit(NFS4CLNT_RECLAIM_REBOOT, + &clp->cl_state); + pnfs_destroy_all_layouts(clp); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e6e8f3b9a1de..e6161b213ed1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -113,7 +113,11 @@ static int nfs4_stat_to_errno(int); #define encode_restorefh_maxsz (op_encode_hdr_maxsz) #define decode_restorefh_maxsz (op_decode_hdr_maxsz) #define encode_fsinfo_maxsz (encode_getattr_maxsz) -#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15) +/* The 5 accounts for the PNFS attributes, and assumes that at most three + * layout types will be returned. + */ +#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz + 4 + 8 + 5) #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) #define decode_renew_maxsz (op_decode_hdr_maxsz) #define encode_setclientid_maxsz \ @@ -314,6 +318,17 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) +#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ + encode_verifier_maxsz) +#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ + 2 /* nfs_cookie4 gdlr_cookie */ + \ + decode_verifier_maxsz \ + /* verifier4 gdlr_verifier */ + \ + 1 /* gdlr_deviceid_list count */ + \ + XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ + NFS4_DEVICEID4_SIZE) \ + /* gdlr_deviceid_list */ + \ + 1 /* bool gdlr_eof */) #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ @@ -343,6 +358,14 @@ static int nfs4_stat_to_errno(int); 1 /* FIXME: opaque lrf_body always empty at the moment */) #define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ 1 + decode_stateid_maxsz) +#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1) +#define decode_secinfo_no_name_maxsz decode_secinfo_maxsz +#define encode_test_stateid_maxsz (op_encode_hdr_maxsz + 2 + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1) +#define encode_free_stateid_maxsz (op_encode_hdr_maxsz + 1 + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_free_stateid_maxsz (op_decode_hdr_maxsz + 1) #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -740,6 +763,14 @@ static int nfs4_stat_to_errno(int); #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_reclaim_complete_maxsz) +#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getdevicelist_maxsz) +#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getdevicelist_maxsz) #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz +\ encode_getdeviceinfo_maxsz) @@ -772,6 +803,26 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_layoutreturn_maxsz) +#define NFS4_enc_secinfo_no_name_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putrootfh_maxsz +\ + encode_secinfo_no_name_maxsz) +#define NFS4_dec_secinfo_no_name_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putrootfh_maxsz + \ + decode_secinfo_no_name_maxsz) +#define NFS4_enc_test_stateid_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_test_stateid_maxsz) +#define NFS4_dec_test_stateid_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_test_stateid_maxsz) +#define NFS4_enc_free_stateid_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_free_stateid_maxsz) +#define NFS4_dec_free_stateid_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_free_stateid_maxsz) const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + compound_encode_hdr_maxsz + @@ -1076,6 +1127,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm hdr->replen += decode_getattr_maxsz; } +static void +encode_getattr_three(struct xdr_stream *xdr, + uint32_t bm0, uint32_t bm1, uint32_t bm2, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_GETATTR); + if (bm2) { + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(bm0); + *p++ = cpu_to_be32(bm1); + *p = cpu_to_be32(bm2); + } else if (bm1) { + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p = cpu_to_be32(bm1); + } else { + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(bm0); + } + hdr->nops++; + hdr->replen += decode_getattr_maxsz; +} + static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], @@ -1084,8 +1164,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], - bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); + encode_getattr_three(xdr, + bitmask[0] & nfs4_fsinfo_bitmap[0], + bitmask[1] & nfs4_fsinfo_bitmap[1], + bitmask[2] & nfs4_fsinfo_bitmap[2], + hdr); } static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) @@ -1827,6 +1910,26 @@ static void encode_sequence(struct xdr_stream *xdr, #ifdef CONFIG_NFS_V4_1 static void +encode_getdevicelist(struct xdr_stream *xdr, + const struct nfs4_getdevicelist_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + nfs4_verifier dummy = { + .data = "dummmmmy", + }; + + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(OP_GETDEVICELIST); + *p++ = cpu_to_be32(args->layoutclass); + *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); + xdr_encode_hyper(p, 0ULL); /* cookie */ + encode_nfs4_verifier(xdr, &dummy); + hdr->nops++; + hdr->replen += decode_getdevicelist_maxsz; +} + +static void encode_getdeviceinfo(struct xdr_stream *xdr, const struct nfs4_getdeviceinfo_args *args, struct compound_hdr *hdr) @@ -1888,7 +1991,7 @@ encode_layoutcommit(struct xdr_stream *xdr, *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); /* Only whole file layouts */ p = xdr_encode_hyper(p, 0); /* offset */ - p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */ + p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ *p++ = cpu_to_be32(0); /* reclaim */ p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(1); /* newoffset = TRUE */ @@ -1938,6 +2041,46 @@ encode_layoutreturn(struct xdr_stream *xdr, hdr->nops++; hdr->replen += decode_layoutreturn_maxsz; } + +static int +encode_secinfo_no_name(struct xdr_stream *xdr, + const struct nfs41_secinfo_no_name_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_SECINFO_NO_NAME); + *p++ = cpu_to_be32(args->style); + hdr->nops++; + hdr->replen += decode_secinfo_no_name_maxsz; + return 0; +} + +static void encode_test_stateid(struct xdr_stream *xdr, + struct nfs41_test_stateid_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_TEST_STATEID); + *p++ = cpu_to_be32(1); + xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_test_stateid_maxsz; +} + +static void encode_free_stateid(struct xdr_stream *xdr, + struct nfs41_free_stateid_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + p = reserve_space(xdr, 4 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_FREE_STATEID); + xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_free_stateid_maxsz; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2536,7 +2679,7 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, struct compound_hdr hdr = { .nops = 0, }; - const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; + const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; encode_compound_hdr(xdr, req, &hdr); encode_setclientid_confirm(xdr, arg, &hdr); @@ -2680,7 +2823,7 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), }; - const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; + const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->la_seq_args, &hdr); @@ -2707,6 +2850,24 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, } /* + * Encode GETDEVICELIST request + */ +static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_getdevicelist_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getdevicelist(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* * Encode GETDEVICEINFO request */ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, @@ -2790,6 +2951,59 @@ static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, encode_layoutreturn(xdr, args, &hdr); encode_nops(&hdr); } + +/* + * Encode SECINFO_NO_NAME request + */ +static int nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_secinfo_no_name_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_secinfo_no_name(xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} + +/* + * Encode TEST_STATEID request + */ +static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_test_stateid_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_test_stateid(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode FREE_STATEID request + */ +static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_free_stateid_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_free_stateid(xdr, args, &hdr); + encode_nops(&hdr); +} #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -2890,14 +3104,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) goto out_overflow; bmlen = be32_to_cpup(p); - bitmap[0] = bitmap[1] = 0; + bitmap[0] = bitmap[1] = bitmap[2] = 0; p = xdr_inline_decode(xdr, (bmlen << 2)); if (unlikely(!p)) goto out_overflow; if (bmlen > 0) { bitmap[0] = be32_to_cpup(p++); - if (bmlen > 1) - bitmap[1] = be32_to_cpup(p); + if (bmlen > 1) { + bitmap[1] = be32_to_cpup(p++); + if (bmlen > 2) + bitmap[2] = be32_to_cpup(p); + } } return 0; out_overflow: @@ -2929,8 +3146,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3 return ret; bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; } else - bitmask[0] = bitmask[1] = 0; - dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); + bitmask[0] = bitmask[1] = bitmask[2] = 0; + dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, + bitmask[0], bitmask[1], bitmask[2]); return 0; } @@ -3984,7 +4202,7 @@ out_overflow: static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) { __be32 *savep; - uint32_t attrlen, bitmap[2] = {0}; + uint32_t attrlen, bitmap[3] = {0}; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -4010,7 +4228,7 @@ xdr_error: static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) { __be32 *savep; - uint32_t attrlen, bitmap[2] = {0}; + uint32_t attrlen, bitmap[3] = {0}; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -4042,7 +4260,7 @@ xdr_error: static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) { __be32 *savep; - uint32_t attrlen, bitmap[2] = {0}; + uint32_t attrlen, bitmap[3] = {0}; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -4182,7 +4400,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat { __be32 *savep; uint32_t attrlen, - bitmap[2] = {0}; + bitmap[3] = {0}; int status; status = decode_op_hdr(xdr, OP_GETATTR); @@ -4268,10 +4486,32 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, return status; } +/* + * The prefered block size for layout directed io + */ +static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *res) +{ + __be32 *p; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); + *res = 0; + if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) { + print_overflow_msg(__func__, xdr); + return -EIO; + } + *res = be32_to_cpup(p); + bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; + } + return 0; +} + static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) { __be32 *savep; - uint32_t attrlen, bitmap[2]; + uint32_t attrlen, bitmap[3]; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -4299,6 +4539,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); if (status != 0) goto xdr_error; + status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); + if (status) + goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: @@ -4718,7 +4961,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, { __be32 *savep; uint32_t attrlen, - bitmap[2] = {0}; + bitmap[3] = {0}; struct kvec *iov = req->rq_rcv_buf.head; int status; @@ -4977,11 +5220,17 @@ static int decode_exchange_id(struct xdr_stream *xdr, if (unlikely(status)) return status; - /* Throw away server_scope */ + /* Save server_scope */ status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + + memcpy(res->server_scope->server_scope, dummy_str, dummy); + res->server_scope->server_scope_sz = dummy; + /* Throw away Implementation id array */ status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) @@ -5141,6 +5390,53 @@ out_overflow: } #if defined(CONFIG_NFS_V4_1) +/* + * TODO: Need to handle case when EOF != true; + */ +static int decode_getdevicelist(struct xdr_stream *xdr, + struct pnfs_devicelist *res) +{ + __be32 *p; + int status, i; + struct nfs_writeverf verftemp; + + status = decode_op_hdr(xdr, OP_GETDEVICELIST); + if (status) + return status; + + p = xdr_inline_decode(xdr, 8 + 8 + 4); + if (unlikely(!p)) + goto out_overflow; + + /* TODO: Skip cookie for now */ + p += 2; + + /* Read verifier */ + p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); + + res->num_devs = be32_to_cpup(p); + + dprintk("%s: num_dev %d\n", __func__, res->num_devs); + + if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { + printk(KERN_ERR "%s too many result dev_num %u\n", + __func__, res->num_devs); + return -EIO; + } + + p = xdr_inline_decode(xdr, + res->num_devs * NFS4_DEVICEID4_SIZE + 4); + if (unlikely(!p)) + goto out_overflow; + for (i = 0; i < res->num_devs; i++) + p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, + NFS4_DEVICEID4_SIZE); + res->eof = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} static int decode_getdeviceinfo(struct xdr_stream *xdr, struct pnfs_device *pdev) @@ -5303,6 +5599,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr, int status; status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); + res->status = status; if (status) return status; @@ -5322,6 +5619,55 @@ out_overflow: print_overflow_msg(__func__, xdr); return -EIO; } + +static int decode_test_stateid(struct xdr_stream *xdr, + struct nfs41_test_stateid_res *res) +{ + __be32 *p; + int status; + int num_res; + + status = decode_op_hdr(xdr, OP_TEST_STATEID); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num_res = be32_to_cpup(p++); + if (num_res != 1) + goto out; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->status = be32_to_cpup(p++); + return res->status; +out_overflow: + print_overflow_msg(__func__, xdr); +out: + return -EIO; +} + +static int decode_free_stateid(struct xdr_stream *xdr, + struct nfs41_free_stateid_res *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_FREE_STATEID); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->status = be32_to_cpup(p++); + return res->status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -6256,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, if (status) goto out; status = decode_secinfo(xdr, res); - if (status) - goto out; out: return status; } @@ -6366,6 +6710,32 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, } /* + * Decode GETDEVICELIST response + */ +static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_getdevicelist_res *res) +{ + struct compound_hdr hdr; + int status; + + dprintk("encoding getdevicelist!\n"); + + status = decode_compound_hdr(xdr, &hdr); + if (status != 0) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status != 0) + goto out; + status = decode_putfh(xdr); + if (status != 0) + goto out; + status = decode_getdevicelist(xdr, res->devlist); +out: + return status; +} + +/* * Decode GETDEVINFO response */ static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, @@ -6461,6 +6831,72 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, out: return status; } + +/* + * Decode SECINFO_NO_NAME response + */ +static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_secinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putrootfh(xdr); + if (status) + goto out; + status = decode_secinfo(xdr, res); +out: + return status; +} + +/* + * Decode TEST_STATEID response + */ +static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_test_stateid_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_test_stateid(xdr, res); +out: + return status; +} + +/* + * Decode FREE_STATEID response + */ +static int nfs4_xdr_dec_free_stateid(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_free_stateid_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_free_stateid(xdr, res); +out: + return status; +} #endif /* CONFIG_NFS_V4_1 */ /** @@ -6480,7 +6916,7 @@ out: int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus) { - uint32_t bitmap[2] = {0}; + uint32_t bitmap[3] = {0}; uint32_t len; __be32 *p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) @@ -6663,6 +7099,10 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(LAYOUTGET, enc_layoutget, dec_layoutget), PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), + PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), + PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), + PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 8ff2ea3f10ef..c807ab93140e 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -38,21 +38,15 @@ */ #include <linux/module.h> -#include <scsi/osd_initiator.h> +#include <scsi/osd_ore.h> #include "objlayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD -#define _LLU(x) ((unsigned long long)x) - -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), -}; - struct objio_dev_ent { struct nfs4_deviceid_node id_node; - struct osd_dev *od; + struct ore_dev od; }; static void @@ -60,8 +54,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) { struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); - dprintk("%s: free od=%p\n", __func__, de->od); - osduld_put_device(de->od); + dprintk("%s: free od=%p\n", __func__, de->od.od); + osduld_put_device(de->od.od); kfree(de); } @@ -98,12 +92,12 @@ _dev_list_add(const struct nfs_server *nfss, nfss->pnfs_curr_ld, nfss->nfs_client, d_id); - de->od = od; + de->od.od = od; d = nfs4_insert_deviceid_node(&de->id_node); n = container_of(d, struct objio_dev_ent, id_node); if (n != de) { - dprintk("%s: Race with other n->od=%p\n", __func__, n->od); + dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od); objio_free_deviceid_node(&de->id_node); de = n; } @@ -111,28 +105,11 @@ _dev_list_add(const struct nfs_server *nfss, return de; } -struct caps_buffers { - u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; - u8 creds[OSD_CAP_LEN]; -}; - struct objio_segment { struct pnfs_layout_segment lseg; - struct pnfs_osd_object_cred *comps; - - unsigned mirrors_p1; - unsigned stripe_unit; - unsigned group_width; /* Data stripe_units without integrity comps */ - u64 group_depth; - unsigned group_count; - - unsigned max_io_size; - - unsigned comps_index; - unsigned num_comps; - /* variable length */ - struct objio_dev_ent *ods[]; + struct ore_layout layout; + struct ore_components oc; }; static inline struct objio_segment * @@ -141,59 +118,44 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg) return container_of(lseg, struct objio_segment, lseg); } -struct objio_state; -typedef ssize_t (*objio_done_fn)(struct objio_state *ios); - struct objio_state { /* Generic layer */ - struct objlayout_io_state ol_state; - - struct objio_segment *layout; - - struct kref kref; - objio_done_fn done; - void *private; - - unsigned long length; - unsigned numdevs; /* Actually used devs in this IO */ - /* A per-device variable array of size numdevs */ - struct _objio_per_comp { - struct bio *bio; - struct osd_request *or; - unsigned long length; - u64 offset; - unsigned dev; - } per_dev[]; + struct objlayout_io_res oir; + + bool sync; + /*FIXME: Support for extra_bytes at ore_get_rw_state() */ + struct ore_io_state *ios; }; /* Send and wait for a get_device_info of devices in the layout, then look them up with the osd_initiator library */ -static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, unsigned comp, - gfp_t gfp_flags) +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, + gfp_t gfp_flags) { struct pnfs_osd_deviceaddr *deviceaddr; - struct nfs4_deviceid *d_id; struct objio_dev_ent *ode; struct osd_dev *od; struct osd_dev_info odi; int err; - d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id; - ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); - if (ode) - return ode; + if (ode) { + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + return 0; + } err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); if (unlikely(err)) { dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); - return ERR_PTR(err); + return err; } odi.systemid_len = deviceaddr->oda_systemid.len; if (odi.systemid_len > sizeof(odi.systemid)) { + dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", + __func__, sizeof(odi.systemid)); err = -EINVAL; goto out; } else if (odi.systemid_len) @@ -218,96 +180,53 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, gfp_flags); - + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + dprintk("Adding new dev_id(%llx:%llx)\n", + _DEVID_LO(d_id), _DEVID_HI(d_id)); out: - dprintk("%s: return=%d\n", __func__, err); objlayout_put_deviceinfo(deviceaddr); - return err ? ERR_PTR(err) : ode; + return err; } -static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, - gfp_t gfp_flags) +static void copy_single_comp(struct ore_components *oc, unsigned c, + struct pnfs_osd_object_cred *src_comp) { - unsigned i; - int err; + struct ore_comp *ocomp = &oc->comps[c]; - /* lookup all devices */ - for (i = 0; i < objio_seg->num_comps; i++) { - struct objio_dev_ent *ode; + WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ + WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); - ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); - if (unlikely(IS_ERR(ode))) { - err = PTR_ERR(ode); - goto out; - } - objio_seg->ods[i] = ode; - } - err = 0; + ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; + ocomp->obj.id = src_comp->oc_object_id.oid_object_id; -out: - dprintk("%s: return=%d\n", __func__, err); - return err; + memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); } -static int _verify_data_map(struct pnfs_osd_layout *layout) +int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, + struct objio_segment **pseg) { - struct pnfs_osd_data_map *data_map = &layout->olo_map; - u64 stripe_length; - u32 group_width; - -/* FIXME: Only raid0 for now. if not go through MDS */ - if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { - printk(KERN_ERR "Only RAID_0 for now\n"); - return -ENOTSUPP; - } - if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { - printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", - data_map->odm_num_comps, data_map->odm_mirror_cnt); - return -EINVAL; - } + struct __alloc_objio_segment { + struct objio_segment olseg; + struct ore_dev *ods[numdevs]; + struct ore_comp comps[numdevs]; + } *aolseg; - if (data_map->odm_group_width) - group_width = data_map->odm_group_width; - else - group_width = data_map->odm_num_comps / - (data_map->odm_mirror_cnt + 1); - - stripe_length = (u64)data_map->odm_stripe_unit * group_width; - if (stripe_length >= (1ULL << 32)) { - printk(KERN_ERR "Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -ENOTSUPP; + aolseg = kzalloc(sizeof(*aolseg), gfp_flags); + if (unlikely(!aolseg)) { + dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__, + numdevs, sizeof(*aolseg)); + return -ENOMEM; } - if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { - printk(KERN_ERR "Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(data_map->odm_stripe_unit), PAGE_SIZE); - return -ENOTSUPP; - } + aolseg->olseg.oc.numdevs = numdevs; + aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS; + aolseg->olseg.oc.comps = aolseg->comps; + aolseg->olseg.oc.ods = aolseg->ods; + *pseg = &aolseg->olseg; return 0; } -static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp, - struct pnfs_osd_object_cred *src_comp, - struct caps_buffers *caps_p) -{ - WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key)); - WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds)); - - *cur_comp = *src_comp; - - memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, - sizeof(caps_p->caps_key)); - cur_comp->oc_cap_key.cred = caps_p->caps_key; - - memcpy(caps_p->creds, src_comp->oc_cap.cred, - sizeof(caps_p->creds)); - cur_comp->oc_cap.cred = caps_p->creds; -} - int objio_alloc_lseg(struct pnfs_layout_segment **outp, struct pnfs_layout_hdr *pnfslay, struct pnfs_layout_range *range, @@ -317,59 +236,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, struct objio_segment *objio_seg; struct pnfs_osd_xdr_decode_layout_iter iter; struct pnfs_osd_layout layout; - struct pnfs_osd_object_cred *cur_comp, src_comp; - struct caps_buffers *caps_p; + struct pnfs_osd_object_cred src_comp; + unsigned cur_comp; int err; err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); if (unlikely(err)) return err; - err = _verify_data_map(&layout); + err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); if (unlikely(err)) return err; - objio_seg = kzalloc(sizeof(*objio_seg) + - sizeof(objio_seg->ods[0]) * layout.olo_num_comps + - sizeof(*objio_seg->comps) * layout.olo_num_comps + - sizeof(struct caps_buffers) * layout.olo_num_comps, - gfp_flags); - if (!objio_seg) - return -ENOMEM; + objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; + objio_seg->layout.group_width = layout.olo_map.odm_group_width; + objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; + objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; + objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; - objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); - cur_comp = objio_seg->comps; - caps_p = (void *)(cur_comp + layout.olo_num_comps); - while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) - copy_single_comp(cur_comp++, &src_comp, caps_p++); + err = ore_verify_layout(layout.olo_map.odm_num_comps, + &objio_seg->layout); if (unlikely(err)) goto err; - objio_seg->num_comps = layout.olo_num_comps; - objio_seg->comps_index = layout.olo_comps_index; - err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); - if (err) - goto err; - - objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; - objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; - if (layout.olo_map.odm_group_width) { - objio_seg->group_width = layout.olo_map.odm_group_width; - objio_seg->group_depth = layout.olo_map.odm_group_depth; - objio_seg->group_count = layout.olo_map.odm_num_comps / - objio_seg->mirrors_p1 / - objio_seg->group_width; - } else { - objio_seg->group_width = layout.olo_map.odm_num_comps / - objio_seg->mirrors_p1; - objio_seg->group_depth = -1; - objio_seg->group_count = 1; + objio_seg->oc.first_dev = layout.olo_comps_index; + cur_comp = 0; + while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { + copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); + err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, + &src_comp.oc_object_id.oid_device_id, + gfp_flags); + if (err) + goto err; + ++cur_comp; } - - /* Cache this calculation it will hit for every page */ - objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - - objio_seg->stripe_unit) * - objio_seg->group_width; + /* pnfs_osd_xdr_decode_layout_comp returns false on error */ + if (unlikely(err)) + goto err; *outp = &objio_seg->lseg; return 0; @@ -386,43 +289,63 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) int i; struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - for (i = 0; i < objio_seg->num_comps; i++) { - if (!objio_seg->ods[i]) + for (i = 0; i < objio_seg->oc.numdevs; i++) { + struct ore_dev *od = objio_seg->oc.ods[i]; + struct objio_dev_ent *ode; + + if (!od) break; - nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); + ode = container_of(od, typeof(*ode), od); + nfs4_put_deviceid_node(&ode->id_node); } kfree(objio_seg); } -int objio_alloc_io_state(struct pnfs_layout_segment *lseg, - struct objlayout_io_state **outp, - gfp_t gfp_flags) +static int +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading, + struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, + loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, + struct objio_state **outp) { struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - struct objio_state *ios; - const unsigned first_size = sizeof(*ios) + - objio_seg->num_comps * sizeof(ios->per_dev[0]); - const unsigned sec_size = objio_seg->num_comps * - sizeof(ios->ol_state.ioerrs[0]); - - ios = kzalloc(first_size + sec_size, gfp_flags); - if (unlikely(!ios)) + struct ore_io_state *ios; + int ret; + struct __alloc_objio_state { + struct objio_state objios; + struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; + } *aos; + + aos = kzalloc(sizeof(*aos), gfp_flags); + if (unlikely(!aos)) return -ENOMEM; - ios->layout = objio_seg; - ios->ol_state.ioerrs = ((void *)ios) + first_size; - ios->ol_state.num_comps = objio_seg->num_comps; + objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, + aos->ioerrs, rpcdata, pnfs_layout_type); - *outp = &ios->ol_state; + ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, + offset, count, &ios); + if (unlikely(ret)) { + kfree(aos); + return ret; + } + + ios->pages = pages; + ios->pgbase = pgbase; + ios->private = aos; + BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); + + aos->objios.sync = 0; + aos->objios.ios = ios; + *outp = &aos->objios; return 0; } -void objio_free_io_state(struct objlayout_io_state *ol_state) +void objio_free_result(struct objlayout_io_res *oir) { - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); + struct objio_state *objios = container_of(oir, struct objio_state, oir); - kfree(ios); + ore_put_io_state(objios->ios); + kfree(objios); } enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) @@ -455,543 +378,152 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) } } -static void _clear_bio(struct bio *bio) +static void __on_dev_error(struct ore_io_state *ios, + struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, + u64 dev_offset, u64 dev_len) { - struct bio_vec *bv; - unsigned i; - - __bio_for_each_segment(bv, bio, i, 0) { - unsigned this_count = bv->bv_len; - - if (likely(PAGE_SIZE == this_count)) - clear_highpage(bv->bv_page); - else - zero_user(bv->bv_page, bv->bv_offset, this_count); - } -} + struct objio_state *objios = ios->private; + struct pnfs_osd_objid pooid; + struct objio_dev_ent *ode = container_of(od, typeof(*ode), od); + /* FIXME: what to do with more-then-one-group layouts. We need to + * translate from ore_io_state index to oc->comps index + */ + unsigned comp = dev_index; -static int _io_check(struct objio_state *ios, bool is_write) -{ - enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; - int lin_ret = 0; - int i; - - for (i = 0; i < ios->numdevs; i++) { - struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; - unsigned dev; - int ret; - - if (!or) - continue; - - ret = osd_req_decode_sense(or, &osi); - if (likely(!ret)) - continue; - - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { - /* start read offset passed endof file */ - BUG_ON(is_write); - _clear_bio(ios->per_dev[i].bio); - dprintk("%s: start read offset passed end of file " - "offset=0x%llx, length=0x%lx\n", __func__, - _LLU(ios->per_dev[i].offset), - ios->per_dev[i].length); - - continue; /* we recovered */ - } - dev = ios->per_dev[i].dev; - objlayout_io_set_result(&ios->ol_state, dev, - &ios->layout->comps[dev].oc_object_id, - osd_pri_2_pnfs_err(osi.osd_err_pri), - ios->per_dev[i].offset, - ios->per_dev[i].length, - is_write); - - if (osi.osd_err_pri >= oep) { - oep = osi.osd_err_pri; - lin_ret = ret; - } - } + pooid.oid_device_id = ode->id_node.deviceid; + pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; + pooid.oid_object_id = ios->oc->comps[comp].obj.id; - return lin_ret; -} - -/* - * Common IO state helpers. - */ -static void _io_free(struct objio_state *ios) -{ - unsigned i; - - for (i = 0; i < ios->numdevs; i++) { - struct _objio_per_comp *per_dev = &ios->per_dev[i]; - - if (per_dev->or) { - osd_end_request(per_dev->or); - per_dev->or = NULL; - } - - if (per_dev->bio) { - bio_put(per_dev->bio); - per_dev->bio = NULL; - } - } -} - -struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) -{ - unsigned min_dev = ios->layout->comps_index; - unsigned max_dev = min_dev + ios->layout->num_comps; - - BUG_ON(dev < min_dev || max_dev <= dev); - return ios->layout->ods[dev - min_dev]->od; -} - -struct _striping_info { - u64 obj_offset; - u64 group_length; - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, - struct _striping_info *si) -{ - u32 stripe_unit = ios->layout->stripe_unit; - u32 group_width = ios->layout->group_width; - u64 group_depth = ios->layout->group_depth; - u32 U = stripe_unit * group_width; - - u64 T = U * group_depth; - u64 S = T * ios->layout->group_count; - u64 M = div64_u64(file_offset, S); - - /* - G = (L - (M * S)) / T - H = (L - (M * S)) % T - */ - u64 LmodU = file_offset - M * S; - u32 G = div64_u64(LmodU, T); - u64 H = LmodU - G * T; - - u32 N = div_u64(H, U); - - div_u64_rem(file_offset, stripe_unit, &si->unit_off); - si->obj_offset = si->unit_off + (N * stripe_unit) + - (M * group_depth * stripe_unit); - - /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= ios->layout->mirrors_p1; - - si->group_length = T - H; -} - -static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, - unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len, - gfp_t gfp_flags) -{ - unsigned pg = *cur_pg; - struct request_queue *q = - osd_request_queue(_io_od(ios, per_dev->dev)); - - per_dev->length += cur_len; - - if (per_dev->bio == NULL) { - unsigned stripes = ios->layout->num_comps / - ios->layout->mirrors_p1; - unsigned pages_in_stripe = stripes * - (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / - stripes; - - if (BIO_MAX_PAGES_KMALLOC < bio_size) - bio_size = BIO_MAX_PAGES_KMALLOC; - - per_dev->bio = bio_kmalloc(gfp_flags, bio_size); - if (unlikely(!per_dev->bio)) { - dprintk("Faild to allocate BIO size=%u\n", bio_size); - return -ENOMEM; - } - } - - while (cur_len > 0) { - unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); - unsigned added_len; - - BUG_ON(ios->ol_state.nr_pages <= pg); - cur_len -= pglen; - - added_len = bio_add_pc_page(q, per_dev->bio, - ios->ol_state.pages[pg], pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; - pgbase = 0; - ++pg; - } - BUG_ON(cur_len); - - *cur_pg = pg; - return 0; -} - -static int _prepare_one_group(struct objio_state *ios, u64 length, - struct _striping_info *si, unsigned *last_pg, - gfp_t gfp_flags) -{ - unsigned stripe_unit = ios->layout->stripe_unit; - unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; - unsigned dev = si->dev; - unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; - unsigned cur_pg = *last_pg; - int ret = 0; - - while (length) { - struct _objio_per_comp *per_dev = &ios->per_dev[dev]; - unsigned cur_len, page_off = 0; - - if (!per_dev->length) { - per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { - per_dev->offset = si->obj_offset; - cur_len = stripe_unit - si->unit_off; - page_off = si->unit_off & ~PAGE_MASK; - BUG_ON(page_off && - (page_off != ios->ol_state.pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; - cur_len = stripe_unit; - } - - if (max_comp < dev) - max_comp = dev; - } else { - cur_len = stripe_unit; - } - if (cur_len >= length) - cur_len = length; - - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len, gfp_flags); - if (unlikely(ret)) - goto out; - - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; - - length -= cur_len; - ios->length += cur_len; - } -out: - ios->numdevs = max_comp + mirrors_p1; - *last_pg = cur_pg; - return ret; -} - -static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) -{ - u64 length = ios->ol_state.count; - u64 offset = ios->ol_state.offset; - struct _striping_info si; - unsigned last_pg = 0; - int ret = 0; - - while (length) { - _calc_stripe_info(ios, offset, &si); - - if (length < si.group_length) - si.group_length = length; - - ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags); - if (unlikely(ret)) - goto out; - - offset += si.group_length; - length -= si.group_length; - } - -out: - if (!ios->length) - return ret; - - return 0; -} - -static ssize_t _sync_done(struct objio_state *ios) -{ - struct completion *waiting = ios->private; - - complete(waiting); - return 0; -} - -static void _last_io(struct kref *kref) -{ - struct objio_state *ios = container_of(kref, struct objio_state, kref); - - ios->done(ios); -} - -static void _done_io(struct osd_request *or, void *p) -{ - struct objio_state *ios = p; - - kref_put(&ios->kref, _last_io); -} - -static ssize_t _io_exec(struct objio_state *ios) -{ - DECLARE_COMPLETION_ONSTACK(wait); - ssize_t status = 0; /* sync status */ - unsigned i; - objio_done_fn saved_done_fn = ios->done; - bool sync = ios->ol_state.sync; - - if (sync) { - ios->done = _sync_done; - ios->private = &wait; - } - - kref_init(&ios->kref); - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - - if (!or) - continue; - - kref_get(&ios->kref); - osd_execute_request_async(or, _done_io, ios); - } - - kref_put(&ios->kref, _last_io); - - if (sync) { - wait_for_completion(&wait); - status = saved_done_fn(ios); - } - - return status; + objlayout_io_set_result(&objios->oir, comp, + &pooid, osd_pri_2_pnfs_err(oep), + dev_offset, dev_len, !ios->reading); } /* * read */ -static ssize_t _read_done(struct objio_state *ios) +static void _read_done(struct ore_io_state *ios, void *private) { + struct objio_state *objios = private; ssize_t status; - int ret = _io_check(ios, false); + int ret = ore_check_io(ios, &__on_dev_error); - _io_free(ios); + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ if (likely(!ret)) status = ios->length; else status = ret; - objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); - return status; -} - -static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) -{ - struct osd_request *or = NULL; - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; - unsigned dev = per_dev->dev; - struct pnfs_osd_object_cred *cred = - &ios->layout->comps[dev]; - struct osd_obj_id obj = { - .partition = cred->oc_object_id.oid_partition_id, - .id = cred->oc_object_id.oid_object_id, - }; - int ret; - - or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; - } - per_dev->or = or; - - osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); - - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); - if (ret) { - dprintk("%s: Faild to osd_finalize_request() => %d\n", - __func__, ret); - goto err; - } - - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), - per_dev->length); - -err: - return ret; -} - -static ssize_t _read_exec(struct objio_state *ios) -{ - unsigned i; - int ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - if (!ios->per_dev[i].length) - continue; - ret = _read_mirrors(ios, i); - if (unlikely(ret)) - goto err; - } - - ios->done = _read_done; - return _io_exec(ios); /* In sync mode exec returns the io status */ - -err: - _io_free(ios); - return ret; + objlayout_read_done(&objios->oir, status, objios->sync); } -ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) +int objio_read_pagelist(struct nfs_read_data *rdata) { - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); + struct objio_state *objios; int ret; - ret = _io_rw_pagelist(ios, GFP_KERNEL); + ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true, + rdata->lseg, rdata->args.pages, rdata->args.pgbase, + rdata->args.offset, rdata->args.count, rdata, + GFP_KERNEL, &objios); if (unlikely(ret)) return ret; - return _read_exec(ios); + objios->ios->done = _read_done; + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + rdata->args.offset, rdata->args.count); + return ore_read(objios->ios); } /* * write */ -static ssize_t _write_done(struct objio_state *ios) +static void _write_done(struct ore_io_state *ios, void *private) { + struct objio_state *objios = private; ssize_t status; - int ret = _io_check(ios, true); + int ret = ore_check_io(ios, &__on_dev_error); - _io_free(ios); + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ if (likely(!ret)) { /* FIXME: should be based on the OSD's persistence model * See OSD2r05 Section 4.13 Data persistence model */ - ios->ol_state.committed = NFS_FILE_SYNC; + objios->oir.committed = NFS_FILE_SYNC; status = ios->length; } else { status = ret; } - objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); - return status; + objlayout_write_done(&objios->oir, status, objios->sync); } -static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) { - struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; - unsigned dev = ios->per_dev[cur_comp].dev; - unsigned last_comp = cur_comp + ios->layout->mirrors_p1; - int ret; + struct objio_state *objios = priv; + struct nfs_write_data *wdata = objios->oir.rpcdata; + pgoff_t index = offset / PAGE_SIZE; + struct page *page = find_get_page(wdata->inode->i_mapping, index); - for (; cur_comp < last_comp; ++cur_comp, ++dev) { - struct osd_request *or = NULL; - struct pnfs_osd_object_cred *cred = - &ios->layout->comps[dev]; - struct osd_obj_id obj = { - .partition = cred->oc_object_id.oid_partition_id, - .id = cred->oc_object_id.oid_object_id, - }; - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; - struct bio *bio; - - or = osd_start_request(_io_od(ios, dev), GFP_NOFS); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; + if (!page) { + page = find_or_create_page(wdata->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + dprintk("%s: grab_cache_page Failed index=0x%lx\n", + __func__, index); + return NULL; } - per_dev->or = or; - - if (per_dev != master_dev) { - bio = bio_kmalloc(GFP_NOFS, - master_dev->bio->bi_max_vecs); - if (unlikely(!bio)) { - dprintk("Faild to allocate BIO size=%u\n", - master_dev->bio->bi_max_vecs); - ret = -ENOMEM; - goto err; - } - - __bio_clone(bio, master_dev->bio); - bio->bi_bdev = NULL; - bio->bi_next = NULL; - per_dev->bio = bio; - per_dev->dev = dev; - per_dev->length = master_dev->length; - per_dev->offset = master_dev->offset; - } else { - bio = master_dev->bio; - bio->bi_rw |= REQ_WRITE; - } - - osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); - - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); - if (ret) { - dprintk("%s: Faild to osd_finalize_request() => %d\n", - __func__, ret); - goto err; - } - - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), - per_dev->length); + unlock_page(page); } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); + return page; +} -err: - return ret; +static void __r4w_put_page(void *priv, struct page *page) +{ + dprintk("%s: index=0x%lx\n", __func__, page->index); + page_cache_release(page); + return; } -static ssize_t _write_exec(struct objio_state *ios) +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + +int objio_write_pagelist(struct nfs_write_data *wdata, int how) { - unsigned i; + struct objio_state *objios; int ret; - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - if (!ios->per_dev[i].length) - continue; - ret = _write_mirrors(ios, i); - if (unlikely(ret)) - goto err; - } + ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false, + wdata->lseg, wdata->args.pages, wdata->args.pgbase, + wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, + &objios); + if (unlikely(ret)) + return ret; - ios->done = _write_done; - return _io_exec(ios); /* In sync mode exec returns the io->status */ + objios->sync = 0 != (how & FLUSH_SYNC); + objios->ios->r4w = &_r4w_op; -err: - _io_free(ios); - return ret; -} - -ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) -{ - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); - int ret; + if (!objios->sync) + objios->ios->done = _write_done; - /* TODO: ios->stable = stable; */ - ret = _io_rw_pagelist(ios, GFP_NOFS); + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + wdata->args.offset, wdata->args.count); + ret = ore_write(objios->ios); if (unlikely(ret)) return ret; - return _write_exec(ios); + if (objios->sync) + _write_done(objios->ios, objios); + + return 0; } static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, @@ -1000,13 +532,22 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, if (!pnfs_generic_pg_test(pgio, prev, req)) return false; - if (pgio->pg_lseg == NULL) - return true; - return pgio->pg_count + req->wb_bytes <= - OBJIO_LSEG(pgio->pg_lseg)->max_io_size; + OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; } +static const struct nfs_pageio_ops objio_pg_read_ops = { + .pg_init = pnfs_generic_pg_init_read, + .pg_test = objio_pg_test, + .pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops objio_pg_write_ops = { + .pg_init = pnfs_generic_pg_init_write, + .pg_test = objio_pg_test, + .pg_doio = pnfs_generic_pg_writepages, +}; + static struct pnfs_layoutdriver_type objlayout_type = { .id = LAYOUT_OSD2_OBJECTS, .name = "LAYOUT_OSD2_OBJECTS", @@ -1020,7 +561,8 @@ static struct pnfs_layoutdriver_type objlayout_type = { .read_pagelist = objlayout_read_pagelist, .write_pagelist = objlayout_write_pagelist, - .pg_test = objio_pg_test, + .pg_read_ops = &objio_pg_read_ops, + .pg_write_ops = &objio_pg_write_ops, .free_deviceid_node = objio_free_deviceid_node, @@ -1055,5 +597,7 @@ objlayout_exit(void) __func__); } +MODULE_ALIAS("nfs-layouttype4-2"); + module_init(objlayout_init); module_exit(objlayout_exit); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 1d06f8e2adea..72074e3a04f9 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -156,77 +156,39 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1 : NFS4_MAX_UINT64; } -static struct objlayout_io_state * -objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, - struct page **pages, - unsigned pgbase, - loff_t offset, - size_t count, - struct pnfs_layout_segment *lseg, - void *rpcdata, - gfp_t gfp_flags) +void _fix_verify_io_params(struct pnfs_layout_segment *lseg, + struct page ***p_pages, unsigned *p_pgbase, + u64 offset, unsigned long count) { - struct objlayout_io_state *state; u64 lseg_end_offset; - dprintk("%s: allocating io_state\n", __func__); - if (objio_alloc_io_state(lseg, &state, gfp_flags)) - return NULL; - BUG_ON(offset < lseg->pls_range.offset); lseg_end_offset = end_offset(lseg->pls_range.offset, lseg->pls_range.length); BUG_ON(offset >= lseg_end_offset); - if (offset + count > lseg_end_offset) { - count = lseg->pls_range.length - - (offset - lseg->pls_range.offset); - dprintk("%s: truncated count %Zd\n", __func__, count); - } + WARN_ON(offset + count > lseg_end_offset); - if (pgbase > PAGE_SIZE) { - pages += pgbase >> PAGE_SHIFT; - pgbase &= ~PAGE_MASK; + if (*p_pgbase > PAGE_SIZE) { + dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); + *p_pages += *p_pgbase >> PAGE_SHIFT; + *p_pgbase &= ~PAGE_MASK; } - - INIT_LIST_HEAD(&state->err_list); - state->lseg = lseg; - state->rpcdata = rpcdata; - state->pages = pages; - state->pgbase = pgbase; - state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; - state->offset = offset; - state->count = count; - state->sync = 0; - - return state; -} - -static void -objlayout_free_io_state(struct objlayout_io_state *state) -{ - dprintk("%s: freeing io_state\n", __func__); - if (unlikely(!state)) - return; - - objio_free_io_state(state); } /* * I/O done common code */ static void -objlayout_iodone(struct objlayout_io_state *state) +objlayout_iodone(struct objlayout_io_res *oir) { - dprintk("%s: state %p status\n", __func__, state); - - if (likely(state->status >= 0)) { - objlayout_free_io_state(state); + if (likely(oir->status >= 0)) { + objio_free_result(oir); } else { - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + struct objlayout *objlay = oir->objlay; spin_lock(&objlay->lock); objlay->delta_space_valid = OBJ_DSU_INVALID; - list_add(&objlay->err_list, &state->err_list); + list_add(&objlay->err_list, &oir->err_list); spin_unlock(&objlay->lock); } } @@ -238,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state) * the error for later reporting at layout-return. */ void -objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, +objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, struct pnfs_osd_objid *pooid, int osd_error, u64 offset, u64 length, bool is_write) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; - BUG_ON(index >= state->num_comps); + BUG_ON(index >= oir->num_comps); if (osd_error) { ioerr->oer_component = *pooid; ioerr->oer_comp_offset = offset; @@ -285,21 +247,18 @@ static void _rpc_read_complete(struct work_struct *work) } void -objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) +objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - int eof = state->eof; - struct nfs_read_data *rdata; + struct nfs_read_data *rdata = oir->rpcdata; - state->status = status; - dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); - rdata = state->rpcdata; - rdata->task.tk_status = status; - if (status >= 0) { + oir->status = rdata->task.tk_status = status; + if (status >= 0) rdata->res.count = status; - rdata->res.eof = eof; - } - objlayout_iodone(state); - /* must not use state after this point */ + objlayout_iodone(oir); + /* must not use oir after this point */ + + dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, + status, rdata->res.eof, sync); if (sync) pnfs_ld_read_done(rdata); @@ -317,40 +276,36 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) { loff_t offset = rdata->args.offset; size_t count = rdata->args.count; - struct objlayout_io_state *state; - ssize_t status = 0; + int err; loff_t eof; - dprintk("%s: Begin inode %p offset %llu count %d\n", - __func__, rdata->inode, offset, (int)count); - eof = i_size_read(rdata->inode); if (unlikely(offset + count > eof)) { if (offset >= eof) { - status = 0; + err = 0; rdata->res.count = 0; rdata->res.eof = 1; + /*FIXME: do we need to call pnfs_ld_read_done() */ goto out; } count = eof - offset; } - state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, - rdata->args.pages, rdata->args.pgbase, - offset, count, - rdata->lseg, rdata, - GFP_KERNEL); - if (unlikely(!state)) { - status = -ENOMEM; - goto out; - } + rdata->res.eof = (offset + count) >= eof; + _fix_verify_io_params(rdata->lseg, &rdata->args.pages, + &rdata->args.pgbase, + rdata->args.offset, rdata->args.count); - state->eof = state->offset + state->count >= eof; + dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", + __func__, rdata->inode->i_ino, offset, count, rdata->res.eof); - status = objio_read_pagelist(state); + err = objio_read_pagelist(rdata); out: - dprintk("%s: Return status %Zd\n", __func__, status); - rdata->pnfs_error = status; + if (unlikely(err)) { + rdata->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } return PNFS_ATTEMPTED; } @@ -371,26 +326,20 @@ static void _rpc_write_complete(struct work_struct *work) } void -objlayout_write_done(struct objlayout_io_state *state, ssize_t status, - bool sync) +objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_write_data *wdata; + struct nfs_write_data *wdata = oir->rpcdata; - dprintk("%s: Begin\n", __func__); - wdata = state->rpcdata; - state->status = status; - wdata->task.tk_status = status; + oir->status = wdata->task.tk_status = status; if (status >= 0) { wdata->res.count = status; - wdata->verf.committed = state->committed; - dprintk("%s: Return status %d committed %d\n", - __func__, wdata->task.tk_status, - wdata->verf.committed); - } else - dprintk("%s: Return status %d\n", - __func__, wdata->task.tk_status); - objlayout_iodone(state); - /* must not use state after this point */ + wdata->verf.committed = oir->committed; + } + objlayout_iodone(oir); + /* must not use oir after this point */ + + dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, + status, wdata->verf.committed, sync); if (sync) pnfs_ld_write_done(wdata); @@ -407,30 +356,18 @@ enum pnfs_try_status objlayout_write_pagelist(struct nfs_write_data *wdata, int how) { - struct objlayout_io_state *state; - ssize_t status; - - dprintk("%s: Begin inode %p offset %llu count %u\n", - __func__, wdata->inode, wdata->args.offset, wdata->args.count); - - state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, - wdata->args.pages, - wdata->args.pgbase, - wdata->args.offset, - wdata->args.count, - wdata->lseg, wdata, - GFP_NOFS); - if (unlikely(!state)) { - status = -ENOMEM; - goto out; - } + int err; - state->sync = how & FLUSH_SYNC; + _fix_verify_io_params(wdata->lseg, &wdata->args.pages, + &wdata->args.pgbase, + wdata->args.offset, wdata->args.count); - status = objio_write_pagelist(state, how & FLUSH_STABLE); - out: - dprintk("%s: Return status %Zd\n", __func__, status); - wdata->pnfs_error = status; + err = objio_write_pagelist(wdata, how); + if (unlikely(err)) { + wdata->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } return PNFS_ATTEMPTED; } @@ -537,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err, static void encode_accumulated_error(struct objlayout *objlay, __be32 *p) { - struct objlayout_io_state *state, *tmp; + struct objlayout_io_res *oir, *tmp; struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; - list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { unsigned i; - for (i = 0; i < state->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; if (!ioerr->oer_errno) continue; @@ -563,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) merge_ioerr(&accumulated_err, ioerr); } - list_del(&state->err_list); - objlayout_free_io_state(state); + list_del(&oir->err_list); + objio_free_result(oir); } pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); @@ -576,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, const struct nfs4_layoutreturn_args *args) { struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct objlayout_io_state *state, *tmp; + struct objlayout_io_res *oir, *tmp; __be32 *start; dprintk("%s: Begin\n", __func__); @@ -585,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, spin_lock(&objlay->lock); - list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { __be32 *last_xdr = NULL, *p; unsigned i; int res = 0; - for (i = 0; i < state->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; if (!ioerr->oer_errno) continue; @@ -615,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, } last_xdr = p; - pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); + pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); } /* TODO: use xdr_write_pages */ @@ -631,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, encode_accumulated_error(objlay, last_xdr); goto loop_done; } - list_del(&state->err_list); - objlayout_free_io_state(state); + list_del(&oir->err_list); + objio_free_result(oir); } loop_done: spin_unlock(&objlay->lock); diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index a8244c8e042d..8ec34727ed21 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -74,19 +74,11 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo) * per-I/O operation state * embedded in objects provider io_state data structure */ -struct objlayout_io_state { - struct pnfs_layout_segment *lseg; - - struct page **pages; - unsigned pgbase; - unsigned nr_pages; - unsigned long count; - loff_t offset; - bool sync; +struct objlayout_io_res { + struct objlayout *objlay; void *rpcdata; int status; /* res */ - int eof; /* res */ int committed; /* res */ /* Error reporting (layout_return) */ @@ -100,6 +92,18 @@ struct objlayout_io_state { struct pnfs_osd_ioerr *ioerrs; }; +static inline +void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, + struct pnfs_osd_ioerr *ioerrs, void *rpcdata, + struct pnfs_layout_hdr *pnfs_layout_type) +{ + oir->objlay = OBJLAYOUT(pnfs_layout_type); + oir->rpcdata = rpcdata; + INIT_LIST_HEAD(&oir->err_list); + oir->num_comps = num_comps; + oir->ioerrs = ioerrs; +} + /* * Raid engine I/O API */ @@ -110,28 +114,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); -extern int objio_alloc_io_state( - struct pnfs_layout_segment *lseg, - struct objlayout_io_state **outp, - gfp_t gfp_flags); -extern void objio_free_io_state(struct objlayout_io_state *state); +/* objio_free_result will free these @oir structs recieved from + * objlayout_{read,write}_done + */ +extern void objio_free_result(struct objlayout_io_res *oir); -extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); -extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, - bool stable); +extern int objio_read_pagelist(struct nfs_read_data *rdata); +extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); /* * callback API */ -extern void objlayout_io_set_result(struct objlayout_io_state *state, +extern void objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, struct pnfs_osd_objid *pooid, int osd_error, u64 offset, u64 length, bool is_write); static inline void -objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) +objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) { - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); - /* If one of the I/Os errored out and the delta_space_used was * invalid we render the complete report as invalid. Protocol mandate * the DSU be accurate or not reported. @@ -144,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) spin_unlock(&objlay->lock); } -extern void objlayout_read_done(struct objlayout_io_state *state, +extern void objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync); -extern void objlayout_write_done(struct objlayout_io_state *state, +extern void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync); extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c index 16fc758e9123..b3918f7ac34d 100644 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -170,6 +170,9 @@ int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, p = _osd_xdr_decode_data_map(p, &layout->olo_map); layout->olo_comps_index = be32_to_cpup(p++); layout->olo_num_comps = be32_to_cpup(p++); + dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, + layout->olo_comps_index, layout->olo_num_comps); + iter->total_comps = layout->olo_num_comps; return 0; } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 18449f43c568..5668f7c54c41 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -18,6 +18,7 @@ #include <linux/nfs_page.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> +#include <linux/export.h> #include "internal.h" #include "pnfs.h" @@ -41,7 +42,7 @@ nfs_page_free(struct nfs_page *p) /** * nfs_create_request - Create an NFS read/write request. - * @file: file descriptor to use + * @ctx: open context to use * @inode: inode to which the request is attached * @page: page to write * @offset: starting offset within the page for the write @@ -230,7 +231,7 @@ EXPORT_SYMBOL_GPL(nfs_generic_pg_test); */ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, - int (*doio)(struct nfs_pageio_descriptor *), + const struct nfs_pageio_ops *pg_ops, size_t bsize, int io_flags) { @@ -240,13 +241,12 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_bsize = bsize; desc->pg_base = 0; desc->pg_moreio = 0; + desc->pg_recoalesce = 0; desc->pg_inode = inode; - desc->pg_doio = doio; + desc->pg_ops = pg_ops; desc->pg_ioflags = io_flags; desc->pg_error = 0; desc->pg_lseg = NULL; - desc->pg_test = nfs_generic_pg_test; - pnfs_pageio_init(desc, inode); } /** @@ -276,7 +276,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, return false; if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) return false; - return pgio->pg_test(pgio, prev, req); + return pgio->pg_ops->pg_test(pgio, prev, req); } /** @@ -297,6 +297,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; } else { + if (desc->pg_ops->pg_init) + desc->pg_ops->pg_init(desc, req); desc->pg_base = req->wb_pgbase; } nfs_list_remove_request(req); @@ -311,7 +313,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { if (!list_empty(&desc->pg_list)) { - int error = desc->pg_doio(desc); + int error = desc->pg_ops->pg_doio(desc); if (error < 0) desc->pg_error = error; else @@ -331,7 +333,7 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) * Returns true if the request 'req' was successfully coalesced into the * existing list of pages 'desc'. */ -int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, +static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { while (!nfs_pageio_do_add_request(desc, req)) { @@ -340,17 +342,67 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (desc->pg_error < 0) return 0; desc->pg_moreio = 0; + if (desc->pg_recoalesce) + return 0; } return 1; } +static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + + do { + list_splice_init(&desc->pg_list, &head); + desc->pg_bytes_written -= desc->pg_count; + desc->pg_count = 0; + desc->pg_base = 0; + desc->pg_recoalesce = 0; + + while (!list_empty(&head)) { + struct nfs_page *req; + + req = list_first_entry(&head, struct nfs_page, wb_list); + nfs_list_remove_request(req); + if (__nfs_pageio_add_request(desc, req)) + continue; + if (desc->pg_error < 0) + return 0; + break; + } + } while (desc->pg_recoalesce); + return 1; +} + +int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + int ret; + + do { + ret = __nfs_pageio_add_request(desc, req); + if (ret) + break; + if (desc->pg_error < 0) + break; + ret = nfs_do_recoalesce(desc); + } while (ret); + return ret; +} + /** * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor * @desc: pointer to io descriptor */ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) { - nfs_pageio_doio(desc); + for (;;) { + nfs_pageio_doio(desc); + if (!desc->pg_recoalesce) + break; + if (!nfs_do_recoalesce(desc)) + break; + } } /** @@ -369,7 +421,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) if (!list_empty(&desc->pg_list)) { struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); if (index != prev->wb_index + 1) - nfs_pageio_doio(desc); + nfs_pageio_complete(desc); } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 29c0ca7fc347..8e672a2b2d69 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -28,6 +28,8 @@ */ #include <linux/nfs_fs.h> +#include <linux/nfs_page.h> +#include <linux/module.h> #include "internal.h" #include "pnfs.h" #include "iostat.h" @@ -75,8 +77,11 @@ find_pnfs_driver(u32 id) void unset_pnfs_layoutdriver(struct nfs_server *nfss) { - if (nfss->pnfs_curr_ld) + if (nfss->pnfs_curr_ld) { + if (nfss->pnfs_curr_ld->clear_layoutdriver) + nfss->pnfs_curr_ld->clear_layoutdriver(nfss); module_put(nfss->pnfs_curr_ld->owner); + } nfss->pnfs_curr_ld = NULL; } @@ -87,7 +92,8 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss) * @id layout type. Zero (illegal layout type) indicates pNFS not in use. */ void -set_pnfs_layoutdriver(struct nfs_server *server, u32 id) +set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, + u32 id) { struct pnfs_layoutdriver_type *ld_type = NULL; @@ -114,6 +120,13 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id) goto out_no_driver; } server->pnfs_curr_ld = ld_type; + if (ld_type->set_layoutdriver + && ld_type->set_layoutdriver(server, mntfh)) { + printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n", + __func__, id); + module_put(ld_type->owner); + goto out_no_driver; + } dprintk("%s: pNFS module for %u set\n", __func__, id); return; @@ -189,6 +202,7 @@ static void pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) { struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; + put_rpccred(lo->plh_lc_cred); return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); } @@ -223,6 +237,7 @@ static void init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) { INIT_LIST_HEAD(&lseg->pls_list); + INIT_LIST_HEAD(&lseg->pls_lc_list); atomic_set(&lseg->pls_refcount, 1); smp_mb(); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); @@ -448,11 +463,20 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) void pnfs_destroy_all_layouts(struct nfs_client *clp) { + struct nfs_server *server; struct pnfs_layout_hdr *lo; LIST_HEAD(tmp_list); + nfs4_deviceid_mark_client_invalid(clp); + nfs4_deviceid_purge_client(clp); + spin_lock(&clp->cl_lock); - list_splice_init(&clp->cl_layouts, &tmp_list); + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (!list_empty(&server->layouts)) + list_splice_init(&server->layouts, &tmp_list); + } + rcu_read_unlock(); spin_unlock(&clp->cl_lock); while (!list_empty(&tmp_list)) { @@ -661,6 +685,7 @@ _pnfs_return_layout(struct inode *ino) lrp->args.stateid = stateid; lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; lrp->args.inode = ino; + lrp->args.layout = lo; lrp->clp = NFS_SERVER(ino)->nfs_client; status = nfs4_proc_layoutreturn(lrp); @@ -805,7 +830,9 @@ out: } static struct pnfs_layout_hdr * -alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) +alloc_init_layout_hdr(struct inode *ino, + struct nfs_open_context *ctx, + gfp_t gfp_flags) { struct pnfs_layout_hdr *lo; @@ -817,11 +844,14 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) INIT_LIST_HEAD(&lo->plh_segs); INIT_LIST_HEAD(&lo->plh_bulk_recall); lo->plh_inode = ino; + lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); return lo; } static struct pnfs_layout_hdr * -pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) +pnfs_find_alloc_layout(struct inode *ino, + struct nfs_open_context *ctx, + gfp_t gfp_flags) { struct nfs_inode *nfsi = NFS_I(ino); struct pnfs_layout_hdr *new = NULL; @@ -836,7 +866,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) return nfsi->layout; } spin_unlock(&ino->i_lock); - new = alloc_init_layout_hdr(ino, gfp_flags); + new = alloc_init_layout_hdr(ino, ctx, gfp_flags); spin_lock(&ino->i_lock); if (likely(nfsi->layout == NULL)) /* Won the race? */ @@ -920,7 +950,8 @@ pnfs_update_layout(struct inode *ino, }; unsigned pg_offset; struct nfs_inode *nfsi = NFS_I(ino); - struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs_client *clp = server->nfs_client; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg = NULL; bool first = false; @@ -928,7 +959,7 @@ pnfs_update_layout(struct inode *ino, if (!pnfs_enabled_sb(NFS_SERVER(ino))) return NULL; spin_lock(&ino->i_lock); - lo = pnfs_find_alloc_layout(ino, gfp_flags); + lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); goto out_unlock; @@ -964,7 +995,7 @@ pnfs_update_layout(struct inode *ino, */ spin_lock(&clp->cl_lock); BUG_ON(!list_empty(&lo->plh_layouts)); - list_add_tail(&lo->plh_layouts, &clp->cl_layouts); + list_add_tail(&lo->plh_layouts, &server->layouts); spin_unlock(&clp->cl_lock); } @@ -973,7 +1004,8 @@ pnfs_update_layout(struct inode *ino, arg.offset -= pg_offset; arg.length += pg_offset; } - arg.length = PAGE_CACHE_ALIGN(arg.length); + if (arg.length != NFS4_MAX_UINT64) + arg.length = PAGE_CACHE_ALIGN(arg.length); lseg = send_layoutget(lo, ctx, &arg, gfp_flags); if (!lseg && first) { @@ -991,6 +1023,7 @@ out_unlock: spin_unlock(&ino->i_lock); goto out; } +EXPORT_SYMBOL_GPL(pnfs_update_layout); int pnfs_layout_process(struct nfs4_layoutget *lgp) @@ -1048,35 +1081,71 @@ out_forget_reply: goto out; } +void +pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + BUG_ON(pgio->pg_lseg != NULL); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + req->wb_bytes, + IOMODE_READ, + GFP_KERNEL); + /* If no lseg, fall back to read through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_read_mds(pgio); + +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); + +void +pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + BUG_ON(pgio->pg_lseg != NULL); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + req->wb_bytes, + IOMODE_RW, + GFP_NOFS); + /* If no lseg, fall back to write through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_write_mds(pgio); +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); + bool -pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, - struct nfs_page *req) +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) { - enum pnfs_iomode access_type; - gfp_t gfp_flags; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - /* We assume that pg_ioflags == 0 iff we're reading a page */ - if (pgio->pg_ioflags == 0) { - access_type = IOMODE_READ; - gfp_flags = GFP_KERNEL; - } else { - access_type = IOMODE_RW; - gfp_flags = GFP_NOFS; - } + if (ld == NULL) + return false; + nfs_pageio_init(pgio, inode, ld->pg_read_ops, server->rsize, 0); + return true; +} - if (pgio->pg_lseg == NULL) { - if (pgio->pg_count != prev->wb_bytes) - return true; - /* This is first coelesce call for a series of nfs_pages */ - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - prev->wb_context, - req_offset(prev), - pgio->pg_count, - access_type, - gfp_flags); - if (pgio->pg_lseg == NULL) - return true; - } +bool +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; + + if (ld == NULL) + return false; + nfs_pageio_init(pgio, inode, ld->pg_write_ops, server->wsize, ioflags); + return true; +} + +bool +pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + if (pgio->pg_lseg == NULL) + return nfs_generic_pg_test(pgio, prev, req); /* * Test if a nfs_page is fully contained in the pnfs_layout_range. @@ -1100,35 +1169,44 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); /* * Called by non rpc-based layout drivers */ -int -pnfs_ld_write_done(struct nfs_write_data *data) +void pnfs_ld_write_done(struct nfs_write_data *data) { - int status; - - if (!data->pnfs_error) { + if (likely(!data->pnfs_error)) { pnfs_set_layoutcommit(data); data->mds_ops->rpc_call_done(&data->task, data); - data->mds_ops->rpc_release(data); - return 0; + } else { + put_lseg(data->lseg); + data->lseg = NULL; + dprintk("pnfs write error = %d\n", data->pnfs_error); } - - dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, - data->pnfs_error); - status = nfs_initiate_write(data, NFS_CLIENT(data->inode), - data->mds_ops, NFS_FILE_SYNC); - return status ? : -EAGAIN; + data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_write_done); -enum pnfs_try_status +static void +pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, + struct nfs_write_data *data) +{ + list_splice_tail_init(&data->pages, &desc->pg_list); + if (data->req && list_empty(&data->req->wb_list)) + nfs_list_add_request(data->req, &desc->pg_list); + nfs_pageio_reset_write_mds(desc); + desc->pg_recoalesce = 1; + nfs_writedata_release(data); +} + +static enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *wdata, - const struct rpc_call_ops *call_ops, int how) + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg, + int how) { struct inode *inode = wdata->inode; enum pnfs_try_status trypnfs; struct nfs_server *nfss = NFS_SERVER(inode); wdata->mds_ops = call_ops; + wdata->lseg = get_lseg(lseg); dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, inode->i_ino, wdata->args.count, wdata->args.offset, how); @@ -1144,41 +1222,103 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata, return trypnfs; } +static void +pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how) +{ + struct nfs_write_data *data; + const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + + desc->pg_lseg = NULL; + while (!list_empty(head)) { + enum pnfs_try_status trypnfs; + + data = list_entry(head->next, struct nfs_write_data, list); + list_del_init(&data->list); + + trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_write_through_mds(desc, data); + } + put_lseg(lseg); +} + +int +pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + int ret; + + ret = nfs_generic_flush(desc, &head); + if (ret != 0) { + put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + return ret; + } + pnfs_do_multiple_writes(desc, &head, desc->pg_ioflags); + return 0; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); + +static void pnfs_ld_handle_read_error(struct nfs_read_data *data) +{ + struct nfs_pageio_descriptor pgio; + + put_lseg(data->lseg); + data->lseg = NULL; + dprintk("pnfs write error = %d\n", data->pnfs_error); + + nfs_pageio_init_read_mds(&pgio, data->inode); + + while (!list_empty(&data->pages)) { + struct nfs_page *req = nfs_list_entry(data->pages.next); + + nfs_list_remove_request(req); + nfs_pageio_add_request(&pgio, req); + } + nfs_pageio_complete(&pgio); +} + /* * Called by non rpc-based layout drivers */ -int -pnfs_ld_read_done(struct nfs_read_data *data) +void pnfs_ld_read_done(struct nfs_read_data *data) { - int status; - - if (!data->pnfs_error) { + if (likely(!data->pnfs_error)) { __nfs4_read_done_cb(data); data->mds_ops->rpc_call_done(&data->task, data); - data->mds_ops->rpc_release(data); - return 0; - } - - dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, - data->pnfs_error); - status = nfs_initiate_read(data, NFS_CLIENT(data->inode), - data->mds_ops); - return status ? : -EAGAIN; + } else + pnfs_ld_handle_read_error(data); + data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_read_done); +static void +pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, + struct nfs_read_data *data) +{ + list_splice_tail_init(&data->pages, &desc->pg_list); + if (data->req && list_empty(&data->req->wb_list)) + nfs_list_add_request(data->req, &desc->pg_list); + nfs_pageio_reset_read_mds(desc); + desc->pg_recoalesce = 1; + nfs_readdata_release(data); +} + /* * Call the appropriate parallel I/O subsystem read function. */ -enum pnfs_try_status +static enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *rdata, - const struct rpc_call_ops *call_ops) + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg) { struct inode *inode = rdata->inode; struct nfs_server *nfss = NFS_SERVER(inode); enum pnfs_try_status trypnfs; rdata->mds_ops = call_ops; + rdata->lseg = get_lseg(lseg); dprintk("%s: Reading ino:%lu %u@%llu\n", __func__, inode->i_ino, rdata->args.count, rdata->args.offset); @@ -1194,18 +1334,69 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata, return trypnfs; } +static void +pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head) +{ + struct nfs_read_data *data; + const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + + desc->pg_lseg = NULL; + while (!list_empty(head)) { + enum pnfs_try_status trypnfs; + + data = list_entry(head->next, struct nfs_read_data, list); + list_del_init(&data->list); + + trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_read_through_mds(desc, data); + } + put_lseg(lseg); +} + +int +pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + int ret; + + ret = nfs_generic_pagein(desc, &head); + if (ret != 0) { + put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + return ret; + } + pnfs_do_multiple_reads(desc, &head); + return 0; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); + /* - * Currently there is only one (whole file) write lseg. + * There can be multiple RW segments. */ -static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode) +static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) { - struct pnfs_layout_segment *lseg, *rv = NULL; + struct pnfs_layout_segment *lseg; - list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) - if (lseg->pls_range.iomode == IOMODE_RW) - rv = lseg; - return rv; + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { + if (lseg->pls_range.iomode == IOMODE_RW && + test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + list_add(&lseg->pls_lc_list, listp); + } +} + +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_RW) { + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + } else { + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + } } +EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); void pnfs_set_layoutcommit(struct nfs_write_data *wdata) @@ -1216,17 +1407,19 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata) spin_lock(&nfsi->vfs_inode.i_lock); if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { - /* references matched in nfs4_layoutcommit_release */ - get_lseg(wdata->lseg); - wdata->lseg->pls_lc_cred = - get_rpccred(wdata->args.context->state->owner->so_cred); mark_as_dirty = true; dprintk("%s: Set layoutcommit for inode %lu ", __func__, wdata->inode->i_ino); } - if (end_pos > wdata->lseg->pls_end_pos) - wdata->lseg->pls_end_pos = end_pos; + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) { + /* references matched in nfs4_layoutcommit_release */ + get_lseg(wdata->lseg); + } + if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; spin_unlock(&nfsi->vfs_inode.i_lock); + dprintk("%s: lseg %p end_pos %llu\n", + __func__, wdata->lseg, nfsi->layout->plh_lwb); /* if pnfs_layoutcommit_inode() runs between inode locks, the next one * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ @@ -1235,6 +1428,14 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata) } EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) +{ + struct nfs_server *nfss = NFS_SERVER(data->args.inode); + + if (nfss->pnfs_curr_ld->cleanup_layoutcommit) + nfss->pnfs_curr_ld->cleanup_layoutcommit(data); +} + /* * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough @@ -1248,8 +1449,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) { struct nfs4_layoutcommit_data *data; struct nfs_inode *nfsi = NFS_I(inode); - struct pnfs_layout_segment *lseg; - struct rpc_cred *cred; loff_t end_pos; int status = 0; @@ -1261,35 +1460,44 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ data = kzalloc(sizeof(*data), GFP_NOFS); if (!data) { - mark_inode_dirty_sync(inode); status = -ENOMEM; goto out; } + if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + goto out_free; + + if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { + if (!sync) { + status = -EAGAIN; + goto out_free; + } + status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (status) + goto out_free; + } + + INIT_LIST_HEAD(&data->lseg_list); spin_lock(&inode->i_lock); if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); spin_unlock(&inode->i_lock); - kfree(data); - goto out; + wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING); + goto out_free; } - /* - * Currently only one (whole file) write lseg which is referenced - * in pnfs_set_layoutcommit and will be found. - */ - lseg = pnfs_list_write_lseg(inode); - end_pos = lseg->pls_end_pos; - cred = lseg->pls_lc_cred; - lseg->pls_end_pos = 0; - lseg->pls_lc_cred = NULL; + pnfs_list_write_lseg(inode, &data->lseg_list); + + end_pos = nfsi->layout->plh_lwb; + nfsi->layout->plh_lwb = 0; memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, sizeof(nfsi->layout->plh_stateid.data)); spin_unlock(&inode->i_lock); data->args.inode = inode; - data->lseg = lseg; - data->cred = cred; + data->cred = get_rpccred(nfsi->layout->plh_lc_cred); nfs_fattr_init(&data->fattr); data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; data->res.fattr = &data->fattr; @@ -1298,6 +1506,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = nfs4_proc_layoutcommit(data, sync); out: + if (status) + mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; +out_free: + kfree(data); + goto out; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 96bf4e6f45be..1509530cb111 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -36,16 +36,16 @@ enum { NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ NFS_LSEG_ROC, /* roc bit received from server */ + NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ }; struct pnfs_layout_segment { struct list_head pls_list; + struct list_head pls_lc_list; struct pnfs_layout_range pls_range; atomic_t pls_refcount; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; - struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */ - loff_t pls_end_pos; /* LAYOUTCOMMIT write end */ }; enum pnfs_try_status { @@ -80,6 +80,9 @@ struct pnfs_layoutdriver_type { struct module *owner; unsigned flags; + int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); + int (*clear_layoutdriver) (struct nfs_server *); + struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); void (*free_layout_hdr) (struct pnfs_layout_hdr *); @@ -87,7 +90,8 @@ struct pnfs_layoutdriver_type { void (*free_lseg) (struct pnfs_layout_segment *lseg); /* test for nfs page cache coalescing */ - bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + const struct nfs_pageio_ops *pg_read_ops; + const struct nfs_pageio_ops *pg_write_ops; /* Returns true if layoutdriver wants to divert this request to * driver's commit routine. @@ -109,6 +113,8 @@ struct pnfs_layoutdriver_type { struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args); + void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); + void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *args); @@ -124,6 +130,8 @@ struct pnfs_layout_hdr { unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ u32 plh_barrier; /* ignore lower seqids */ unsigned long plh_flags; + loff_t plh_lwb; /* last write byte for layoutcommit */ + struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ struct inode *plh_inode; }; @@ -136,10 +144,21 @@ struct pnfs_device { unsigned int pglen; }; +#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 + +struct pnfs_devicelist { + unsigned int eof; + unsigned int num_devs; + struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; +}; + extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); /* nfs4proc.c */ +extern int nfs4_proc_getdevicelist(struct nfs_server *server, + const struct nfs_fh *fh, + struct pnfs_devicelist *devlist); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev); extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); @@ -148,17 +167,18 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); /* pnfs.c */ void get_layout_hdr(struct pnfs_layout_hdr *lo); void put_lseg(struct pnfs_layout_segment *lseg); -struct pnfs_layout_segment * -pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, - loff_t pos, u64 count, enum pnfs_iomode access_type, - gfp_t gfp_flags); -void set_pnfs_layoutdriver(struct nfs_server *, u32 id); + +bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); +bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int); + +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); void unset_pnfs_layoutdriver(struct nfs_server *); -enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, - const struct rpc_call_ops *, int); -enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, - const struct rpc_call_ops *); +void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); +int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); +void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); +int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); @@ -178,10 +198,24 @@ void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier); void pnfs_set_layoutcommit(struct nfs_write_data *wdata); +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); -int pnfs_ld_write_done(struct nfs_write_data *); -int pnfs_ld_read_done(struct nfs_read_data *); +void pnfs_ld_write_done(struct nfs_write_data *); +void pnfs_ld_read_done(struct nfs_read_data *); +struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + gfp_t gfp_flags); + +void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); + +/* nfs4_deviceid_flags */ +enum { + NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ +}; /* pnfs_dev.c */ struct nfs4_deviceid_node { @@ -189,13 +223,13 @@ struct nfs4_deviceid_node { struct hlist_node tmpnode; const struct pnfs_layoutdriver_type *ld; const struct nfs_client *nfs_client; + unsigned long flags; struct nfs4_deviceid deviceid; atomic_t ref; }; void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); -struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, const struct pnfs_layoutdriver_type *, @@ -293,15 +327,6 @@ static inline int pnfs_return_layout(struct inode *ino) return 0; } -static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, - struct inode *inode) -{ - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; - - if (ld) - pgio->pg_test = ld->pg_test; -} - #else /* CONFIG_NFS_V4_1 */ static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) @@ -322,28 +347,6 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg) { } -static inline struct pnfs_layout_segment * -pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, - loff_t pos, u64 count, enum pnfs_iomode access_type, - gfp_t gfp_flags) -{ - return NULL; -} - -static inline enum pnfs_try_status -pnfs_try_to_read_data(struct nfs_read_data *data, - const struct rpc_call_ops *call_ops) -{ - return PNFS_NOT_ATTEMPTED; -} - -static inline enum pnfs_try_status -pnfs_try_to_write_data(struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, int how) -{ - return PNFS_NOT_ATTEMPTED; -} - static inline int pnfs_return_layout(struct inode *ino) { return 0; @@ -377,7 +380,8 @@ pnfs_roc_drain(struct inode *ino, u32 *barrier) return false; } -static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) +static inline void set_pnfs_layoutdriver(struct nfs_server *s, + const struct nfs_fh *mntfh, u32 id) { } @@ -385,9 +389,14 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) { } -static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, - struct inode *inode) +static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) { + return false; +} + +static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) +{ + return false; } static inline void diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index f0f8e1e22f6c..4f359d2a26eb 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -28,6 +28,7 @@ * such damages. */ +#include <linux/export.h> #include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PNFS @@ -100,8 +101,8 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld, rcu_read_lock(); d = _lookup_deviceid(ld, clp, id, hash); - if (d && !atomic_inc_not_zero(&d->ref)) - d = NULL; + if (d != NULL) + atomic_inc(&d->ref); rcu_read_unlock(); return d; } @@ -115,15 +116,15 @@ nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); /* - * Unhash and put deviceid + * Remove a deviceid from cache * * @clp nfs_client associated with deviceid * @id the deviceid to unhash * * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise. */ -struct nfs4_deviceid_node * -nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, +void +nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, const struct nfs_client *clp, const struct nfs4_deviceid *id) { struct nfs4_deviceid_node *d; @@ -134,7 +135,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, rcu_read_unlock(); if (!d) { spin_unlock(&nfs4_deviceid_lock); - return NULL; + return; } hlist_del_init_rcu(&d->node); spin_unlock(&nfs4_deviceid_lock); @@ -142,28 +143,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, /* balance the initial ref set in pnfs_insert_deviceid */ if (atomic_dec_and_test(&d->ref)) - return d; - - return NULL; -} -EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid); - -/* - * Delete a deviceid from cache - * - * @clp struct nfs_client qualifying the deviceid - * @id deviceid to delete - */ -void -nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, - const struct nfs_client *clp, const struct nfs4_deviceid *id) -{ - struct nfs4_deviceid_node *d; - - d = nfs4_unhash_put_deviceid(ld, clp, id); - if (!d) - return; - d->ld->free_deviceid_node(d); + d->ld->free_deviceid_node(d); } EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); @@ -177,6 +157,7 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, INIT_HLIST_NODE(&d->tmpnode); d->ld = ld; d->nfs_client = nfs_client; + d->flags = 0; d->deviceid = *id; atomic_set(&d->ref, 1); } @@ -221,16 +202,15 @@ EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node); * * @d deviceid node to put * - * @ret true iff the node was deleted + * return true iff the node was deleted + * Note that since the test for d->ref == 0 is sufficient to establish + * that the node is no longer hashed in the global device id cache. */ bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) { - if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock)) + if (!atomic_dec_and_test(&d->ref)) return false; - hlist_del_init_rcu(&d->node); - spin_unlock(&nfs4_deviceid_lock); - synchronize_rcu(); d->ld->free_deviceid_node(d); return true; } @@ -275,3 +255,22 @@ nfs4_deviceid_purge_client(const struct nfs_client *clp) for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) _deviceid_purge_client(clp, h); } + +/* + * Stop use of all deviceids associated with an nfs_client + */ +void +nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) +{ + struct nfs4_deviceid_node *d; + struct hlist_node *n; + int i; + + rcu_read_lock(); + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){ + hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[i], node) + if (d->nfs_client == clp) + set_bit(NFS_DEVICEID_INVALID, &d->flags); + } + rcu_read_unlock(); +} diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index ac40b8535d7e..f48125da198a 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -710,6 +710,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs_dir_inode_operations, .file_inode_ops = &nfs_file_inode_operations, + .file_ops = &nfs_file_operations, .getroot = nfs_proc_get_root, .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a68679f538fc..cfa175c223dc 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -30,22 +30,18 @@ #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc); -static int nfs_pagein_one(struct nfs_pageio_descriptor *desc); +static const struct nfs_pageio_ops nfs_pageio_read_ops; static const struct rpc_call_ops nfs_read_partial_ops; static const struct rpc_call_ops nfs_read_full_ops; static struct kmem_cache *nfs_rdata_cachep; -static mempool_t *nfs_rdata_mempool; - -#define MIN_POOL_READ (32) struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) { - struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); + struct nfs_read_data *p; + p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); if (p) { - memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); p->npages = pagecount; if (pagecount <= ARRAY_SIZE(p->page_array)) @@ -53,7 +49,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) else { p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); if (!p->pagevec) { - mempool_free(p, nfs_rdata_mempool); + kmem_cache_free(nfs_rdata_cachep, p); p = NULL; } } @@ -65,10 +61,10 @@ void nfs_readdata_free(struct nfs_read_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); - mempool_free(p, nfs_rdata_mempool); + kmem_cache_free(nfs_rdata_cachep, p); } -static void nfs_readdata_release(struct nfs_read_data *rdata) +void nfs_readdata_release(struct nfs_read_data *rdata) { put_lseg(rdata->lseg); put_nfs_open_context(rdata->args.context); @@ -113,6 +109,27 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) } } +void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, + NFS_SERVER(inode)->rsize, 0); +} + +void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) +{ + pgio->pg_ops = &nfs_pageio_read_ops; + pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; +} +EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); + +static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + if (!pnfs_pageio_init_read(pgio, inode)) + nfs_pageio_init_read_mds(pgio, inode); +} + int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page) { @@ -131,14 +148,9 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); - nfs_pageio_init(&pgio, inode, NULL, 0, 0); - nfs_list_add_request(new, &pgio.pg_list); - pgio.pg_count = len; - - if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) - nfs_pagein_multi(&pgio); - else - nfs_pagein_one(&pgio); + nfs_pageio_init_read(&pgio, inode); + nfs_pageio_add_request(&pgio, new); + nfs_pageio_complete(&pgio); return 0; } @@ -202,17 +214,14 @@ EXPORT_SYMBOL_GPL(nfs_initiate_read); /* * Set up the NFS read request struct */ -static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset, - struct pnfs_layout_segment *lseg) +static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, + unsigned int count, unsigned int offset) { struct inode *inode = req->wb_context->dentry->d_inode; data->req = req; data->inode = inode; data->cred = req->wb_context->cred; - data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -226,14 +235,36 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->res.count = count; data->res.eof = 0; nfs_fattr_init(&data->fattr); +} - if (data->lseg && - (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) - return 0; +static int nfs_do_read(struct nfs_read_data *data, + const struct rpc_call_ops *call_ops) +{ + struct inode *inode = data->args.context->dentry->d_inode; return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); } +static int +nfs_do_multiple_reads(struct list_head *head, + const struct rpc_call_ops *call_ops) +{ + struct nfs_read_data *data; + int ret = 0; + + while (!list_empty(head)) { + int ret2; + + data = list_entry(head->next, struct nfs_read_data, list); + list_del_init(&data->list); + + ret2 = nfs_do_read(data, call_ops); + if (ret == 0) + ret = ret2; + } + return ret; +} + static void nfs_async_read_error(struct list_head *head) { @@ -242,7 +273,6 @@ nfs_async_read_error(struct list_head *head) while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - SetPageError(req->wb_page); nfs_readpage_release(req); } } @@ -260,20 +290,19 @@ nfs_async_read_error(struct list_head *head) * won't see the new data until our attribute cache is updated. This is more * or less conventional NFS client behavior. */ -static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head *res) { struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; struct nfs_read_data *data; - size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes; + size_t rsize = desc->pg_bsize, nbytes; unsigned int offset; int requests = 0; int ret = 0; - struct pnfs_layout_segment *lseg; - LIST_HEAD(list); nfs_list_remove_request(req); + offset = 0; nbytes = desc->pg_count; do { size_t len = min(nbytes,rsize); @@ -281,65 +310,39 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) data = nfs_readdata_alloc(1); if (!data) goto out_bad; - list_add(&data->pages, &list); + data->pagevec[0] = page; + nfs_read_rpcsetup(req, data, len, offset); + list_add(&data->list, res); requests++; nbytes -= len; + offset += len; } while(nbytes != 0); atomic_set(&req->wb_complete, requests); - - BUG_ON(desc->pg_lseg != NULL); - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, - req_offset(req), desc->pg_count, - IOMODE_READ, GFP_KERNEL); - ClearPageError(page); - offset = 0; - nbytes = desc->pg_count; - do { - int ret2; - - data = list_entry(list.next, struct nfs_read_data, pages); - list_del_init(&data->pages); - - data->pagevec[0] = page; - - if (nbytes < rsize) - rsize = nbytes; - ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, - rsize, offset, lseg); - if (ret == 0) - ret = ret2; - offset += rsize; - nbytes -= rsize; - } while (nbytes != 0); - put_lseg(lseg); - desc->pg_lseg = NULL; - + desc->pg_rpc_callops = &nfs_read_partial_ops; return ret; - out_bad: - while (!list_empty(&list)) { - data = list_entry(list.next, struct nfs_read_data, pages); - list_del(&data->pages); + while (!list_empty(res)) { + data = list_entry(res->next, struct nfs_read_data, list); + list_del(&data->list); nfs_readdata_free(data); } - SetPageError(page); nfs_readpage_release(req); return -ENOMEM; } -static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *res) { struct nfs_page *req; struct page **pages; struct nfs_read_data *data; struct list_head *head = &desc->pg_list; - struct pnfs_layout_segment *lseg = desc->pg_lseg; - int ret = -ENOMEM; + int ret = 0; data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base, desc->pg_count)); if (!data) { nfs_async_read_error(head); + ret = -ENOMEM; goto out; } @@ -348,23 +351,40 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); - ClearPageError(req->wb_page); *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); - if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, - req_offset(req), desc->pg_count, - IOMODE_READ, GFP_KERNEL); - ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, - 0, lseg); + nfs_read_rpcsetup(req, data, desc->pg_count, 0); + list_add(&data->list, res); + desc->pg_rpc_callops = &nfs_read_full_ops; out: - put_lseg(lseg); - desc->pg_lseg = NULL; return ret; } +int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head) +{ + if (desc->pg_bsize < PAGE_CACHE_SIZE) + return nfs_pagein_multi(desc, head); + return nfs_pagein_one(desc, head); +} + +static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + int ret; + + ret = nfs_generic_pagein(desc, &head); + if (ret == 0) + ret = nfs_do_multiple_reads(&head, desc->pg_rpc_callops); + return ret; +} + +static const struct nfs_pageio_ops nfs_pageio_read_ops = { + .pg_test = nfs_generic_pg_test, + .pg_doio = nfs_generic_pg_readpages, +}; + /* * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). @@ -408,7 +428,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; - nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); + rpc_restart_call_prepare(task); } /* @@ -435,10 +455,10 @@ static void nfs_readpage_release_partial(void *calldata) int status = data->task.tk_status; if (status < 0) - SetPageError(page); + set_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags); if (atomic_dec_and_test(&req->wb_complete)) { - if (!PageError(page)) + if (!test_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags)) SetPageUptodate(page); nfs_readpage_release(req); } @@ -621,7 +641,6 @@ readpage_async_filler(void *data, struct page *page) return 0; out_error: error = PTR_ERR(new); - SetPageError(page); out_unlock: unlock_page(page); return error; @@ -635,8 +654,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, .pgio = &pgio, }; struct inode *inode = mapping->host; - struct nfs_server *server = NFS_SERVER(inode); - size_t rsize = server->rsize; unsigned long npages; int ret = -ESTALE; @@ -664,10 +681,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - if (rsize < PAGE_CACHE_SIZE) - nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); - else - nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0); + nfs_pageio_init_read(&pgio, inode); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); @@ -689,16 +703,10 @@ int __init nfs_init_readpagecache(void) if (nfs_rdata_cachep == NULL) return -ENOMEM; - nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ, - nfs_rdata_cachep); - if (nfs_rdata_mempool == NULL) - return -ENOMEM; - return 0; } void nfs_destroy_readpagecache(void) { - mempool_destroy(nfs_rdata_mempool); kmem_cache_destroy(nfs_rdata_cachep); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b961ceac66b4..134777406ee3 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -733,18 +733,22 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } + +#ifdef CONFIG_NFS_V4 #ifdef CONFIG_NFS_V4_1 -void show_sessions(struct seq_file *m, struct nfs_server *server) +static void show_sessions(struct seq_file *m, struct nfs_server *server) { if (nfs4_has_session(server->nfs_client)) seq_printf(m, ",sessions"); } #else -void show_sessions(struct seq_file *m, struct nfs_server *server) {} +static void show_sessions(struct seq_file *m, struct nfs_server *server) {} +#endif #endif +#ifdef CONFIG_NFS_V4 #ifdef CONFIG_NFS_V4_1 -void show_pnfs(struct seq_file *m, struct nfs_server *server) +static void show_pnfs(struct seq_file *m, struct nfs_server *server) { seq_printf(m, ",pnfs="); if (server->pnfs_curr_ld) @@ -752,9 +756,10 @@ void show_pnfs(struct seq_file *m, struct nfs_server *server) else seq_printf(m, "not configured"); } -#else /* CONFIG_NFS_V4_1 */ -void show_pnfs(struct seq_file *m, struct nfs_server *server) {} -#endif /* CONFIG_NFS_V4_1 */ +#else +static void show_pnfs(struct seq_file *m, struct nfs_server *server) {} +#endif +#endif static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) { @@ -2035,9 +2040,6 @@ static inline void nfs_initialise_sb(struct super_block *sb) sb->s_blocksize = nfs_block_bits(server->wsize, &sb->s_blocksize_bits); - if (server->flags & NFS_MOUNT_NOAC) - sb->s_flags |= MS_SYNCHRONOUS; - sb->s_bdi = &server->backing_dev_info; nfs_super_set_maxbytes(sb, server->maxfilesize); @@ -2249,6 +2251,10 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, if (server->flags & NFS_MOUNT_UNSHARED) compare_super = NULL; + /* -o noac implies -o sync */ + if (server->flags & NFS_MOUNT_NOAC) + sb_mntdata.mntflags |= MS_SYNCHRONOUS; + /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); if (IS_ERR(s)) { @@ -2361,6 +2367,10 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags, if (server->flags & NFS_MOUNT_UNSHARED) compare_super = NULL; + /* -o noac implies -o sync */ + if (server->flags & NFS_MOUNT_NOAC) + sb_mntdata.mntflags |= MS_SYNCHRONOUS; + /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); if (IS_ERR(s)) { @@ -2628,6 +2638,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags, if (server->flags & NFS4_MOUNT_UNSHARED) compare_super = NULL; + /* -o noac implies -o sync */ + if (server->flags & NFS_MOUNT_NOAC) + sb_mntdata.mntflags |= MS_SYNCHRONOUS; + /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); if (IS_ERR(s)) { @@ -2773,43 +2787,18 @@ static void nfs_referral_loop_unprotect(void) static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, const char *export_path) { - struct mnt_namespace *ns_private; - struct super_block *s; struct dentry *dentry; - struct path path; - int ret; - - ns_private = create_mnt_ns(root_mnt); - ret = PTR_ERR(ns_private); - if (IS_ERR(ns_private)) - goto out_mntput; + int ret = nfs_referral_loop_protect(); - ret = nfs_referral_loop_protect(); - if (ret != 0) - goto out_put_mnt_ns; - - ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, - export_path, LOOKUP_FOLLOW, &path); + if (ret) { + mntput(root_mnt); + return ERR_PTR(ret); + } + dentry = mount_subtree(root_mnt, export_path); nfs_referral_loop_unprotect(); - put_mnt_ns(ns_private); - if (ret != 0) - goto out_err; - - s = path.mnt->mnt_sb; - atomic_inc(&s->s_active); - dentry = dget(path.dentry); - - path_put(&path); - down_write(&s->s_umount); return dentry; -out_put_mnt_ns: - put_mnt_ns(ns_private); -out_mntput: - mntput(root_mnt); -out_err: - return ERR_PTR(ret); } static struct dentry *nfs4_try_mount(int flags, const char *dev_name, @@ -2916,6 +2905,10 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags, if (server->flags & NFS4_MOUNT_UNSHARED) compare_super = NULL; + /* -o noac implies -o sync */ + if (server->flags & NFS_MOUNT_NOAC) + sb_mntdata.mntflags |= MS_SYNCHRONOUS; + /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); if (IS_ERR(s)) { @@ -3003,6 +2996,10 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, if (server->flags & NFS4_MOUNT_UNSHARED) compare_super = NULL; + /* -o noac implies -o sync */ + if (server->flags & NFS_MOUNT_NOAC) + sb_mntdata.mntflags |= MS_SYNCHRONOUS; + /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); if (IS_ERR(s)) { diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 8d6864c2a5fa..4f9319a2e567 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -87,7 +87,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) struct inode *dir = data->dir; if (!NFS_PROTO(dir)->unlink_done(task, dir)) - nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client); + rpc_restart_call_prepare(task); } /** @@ -147,7 +147,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n alias = d_lookup(parent, &data->args.name); if (alias != NULL) { - int ret = 0; + int ret; void *devname_garbage = NULL; /* @@ -155,14 +155,16 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n * the sillyrename information to the aliased dentry. */ nfs_free_dname(data); + ret = nfs_copy_dname(alias, data); spin_lock(&alias->d_lock); - if (alias->d_inode != NULL && + if (ret == 0 && alias->d_inode != NULL && !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { devname_garbage = alias->d_fsdata; alias->d_fsdata = data; alias->d_flags |= DCACHE_NFSFS_RENAMED; ret = 1; - } + } else + ret = 0; spin_unlock(&alias->d_lock); nfs_dec_sillycount(dir); dput(alias); @@ -171,8 +173,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n * point dentry is definitely not a root, so we won't need * that anymore. */ - if (devname_garbage) - kfree(devname_garbage); + kfree(devname_garbage); return ret; } data->dir = igrab(dir); @@ -204,8 +205,6 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) if (parent == NULL) goto out_free; dir = parent->d_inode; - if (nfs_copy_dname(dentry, data) != 0) - goto out_dput; /* Non-exclusive lock protects against concurrent lookup() calls */ spin_lock(&dir->i_lock); if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { @@ -366,19 +365,21 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct nfs_renamedata *data = calldata; struct inode *old_dir = data->old_dir; struct inode *new_dir = data->new_dir; + struct dentry *old_dentry = data->old_dentry; + struct dentry *new_dentry = data->new_dentry; if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { - nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); + rpc_restart_call_prepare(task); return; } if (task->tk_status != 0) { - nfs_cancel_async_unlink(data->old_dentry); + nfs_cancel_async_unlink(old_dentry); return; } - nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir)); - d_move(data->old_dentry, data->new_dentry); + d_drop(old_dentry); + d_drop(new_dentry); } /** @@ -501,6 +502,14 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, * and only performs the unlink once the last reference to it is put. * * The final cleanup is done during dentry_iput. + * + * (Note: NFSv4 is stateful, and has opens, so in theory an NFSv4 server + * could take responsibility for keeping open files referenced. The server + * would also need to ensure that opened-but-deleted files were kept over + * reboots. However, we may not assume a server does so. (RFC 5661 + * does provide an OPEN4_RESULT_PRESERVE_UNLINKED flag that a server can + * use to advertise that it does this; some day we may take advantage of + * it.)) */ int nfs_sillyrename(struct inode *dir, struct dentry *dentry) @@ -560,6 +569,14 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) if (error) goto out_dput; + /* populate unlinkdata with the right dname */ + error = nfs_copy_dname(sdentry, + (struct nfs_unlinkdata *)dentry->d_fsdata); + if (error) { + nfs_cancel_async_unlink(dentry); + goto out_dput; + } + /* run the rename task, undo unlink if it fails */ task = nfs_async_rename(dir, dir, dentry, sdentry); if (IS_ERR(task)) { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 08579312c57b..1dda78db6a73 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -20,6 +20,7 @@ #include <linux/nfs_mount.h> #include <linux/nfs_page.h> #include <linux/backing-dev.h> +#include <linux/export.h> #include <asm/uaccess.h> @@ -97,7 +98,7 @@ void nfs_writedata_free(struct nfs_write_data *p) mempool_free(p, nfs_wdata_mempool); } -static void nfs_writedata_release(struct nfs_write_data *wdata) +void nfs_writedata_release(struct nfs_write_data *wdata) { put_lseg(wdata->lseg); put_nfs_open_context(wdata->args.context); @@ -390,7 +391,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); BUG_ON(error); if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) - nfsi->change_attr++; + inode->i_version++; set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); @@ -428,7 +429,6 @@ static void nfs_mark_request_dirty(struct nfs_page *req) { __set_page_dirty_nobuffers(req->wb_page); - __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC); } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) @@ -762,6 +762,8 @@ int nfs_updatepage(struct file *file, struct page *page, status = nfs_writepage_setup(ctx, page, offset, count); if (status < 0) nfs_set_pageerror(page); + else + __set_page_dirty_nobuffers(page); dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); @@ -845,11 +847,9 @@ EXPORT_SYMBOL_GPL(nfs_initiate_write); /* * Set up the argument/result storage required for the RPC call. */ -static int nfs_write_rpcsetup(struct nfs_page *req, +static void nfs_write_rpcsetup(struct nfs_page *req, struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, unsigned int count, unsigned int offset, - struct pnfs_layout_segment *lseg, int how) { struct inode *inode = req->wb_context->dentry->d_inode; @@ -860,7 +860,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->req = req; data->inode = inode = req->wb_context->dentry->d_inode; data->cred = req->wb_context->cred; - data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -872,24 +871,51 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.context = get_nfs_open_context(req->wb_context); data->args.lock_context = req->wb_lock_context; data->args.stable = NFS_UNSTABLE; - if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { - data->args.stable = NFS_DATA_SYNC; - if (!nfs_need_commit(NFS_I(inode))) - data->args.stable = NFS_FILE_SYNC; + switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { + case 0: + break; + case FLUSH_COND_STABLE: + if (nfs_need_commit(NFS_I(inode))) + break; + default: + data->args.stable = NFS_FILE_SYNC; } data->res.fattr = &data->fattr; data->res.count = count; data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); +} - if (data->lseg && - (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) - return 0; +static int nfs_do_write(struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, + int how) +{ + struct inode *inode = data->args.context->dentry->d_inode; return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); } +static int nfs_do_multiple_writes(struct list_head *head, + const struct rpc_call_ops *call_ops, + int how) +{ + struct nfs_write_data *data; + int ret = 0; + + while (!list_empty(head)) { + int ret2; + + data = list_entry(head->next, struct nfs_write_data, list); + list_del_init(&data->list); + + ret2 = nfs_do_write(data, call_ops, how); + if (ret == 0) + ret = ret2; + } + return ret; +} + /* If a nfs_flush_* function fails, it should remove reqs from @head and * call this on each, which will prepare them to be retried on next * writeback using standard nfs. @@ -907,17 +933,15 @@ static void nfs_redirty_request(struct nfs_page *req) * Generate multiple small requests to write out a single * contiguous dirty area on one page. */ -static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) +static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head *res) { struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; struct nfs_write_data *data; - size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; + size_t wsize = desc->pg_bsize, nbytes; unsigned int offset; int requests = 0; int ret = 0; - struct pnfs_layout_segment *lseg; - LIST_HEAD(list); nfs_list_remove_request(req); @@ -927,6 +951,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) desc->pg_ioflags &= ~FLUSH_COND_STABLE; + offset = 0; nbytes = desc->pg_count; do { size_t len = min(nbytes, wsize); @@ -934,45 +959,21 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) data = nfs_writedata_alloc(1); if (!data) goto out_bad; - list_add(&data->pages, &list); + data->pagevec[0] = page; + nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags); + list_add(&data->list, res); requests++; nbytes -= len; + offset += len; } while (nbytes != 0); atomic_set(&req->wb_complete, requests); - - BUG_ON(desc->pg_lseg); - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, - req_offset(req), desc->pg_count, - IOMODE_RW, GFP_NOFS); - ClearPageError(page); - offset = 0; - nbytes = desc->pg_count; - do { - int ret2; - - data = list_entry(list.next, struct nfs_write_data, pages); - list_del_init(&data->pages); - - data->pagevec[0] = page; - - if (nbytes < wsize) - wsize = nbytes; - ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, - wsize, offset, lseg, desc->pg_ioflags); - if (ret == 0) - ret = ret2; - offset += wsize; - nbytes -= wsize; - } while (nbytes != 0); - - put_lseg(lseg); - desc->pg_lseg = NULL; + desc->pg_rpc_callops = &nfs_write_partial_ops; return ret; out_bad: - while (!list_empty(&list)) { - data = list_entry(list.next, struct nfs_write_data, pages); - list_del(&data->pages); + while (!list_empty(res)) { + data = list_entry(res->next, struct nfs_write_data, list); + list_del(&data->list); nfs_writedata_free(data); } nfs_redirty_request(req); @@ -987,14 +988,13 @@ out_bad: * This is the case if nfs_updatepage detects a conflicting request * that has been written but not committed. */ -static int nfs_flush_one(struct nfs_pageio_descriptor *desc) +static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *res) { struct nfs_page *req; struct page **pages; struct nfs_write_data *data; struct list_head *head = &desc->pg_list; - struct pnfs_layout_segment *lseg = desc->pg_lseg; - int ret; + int ret = 0; data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, desc->pg_count)); @@ -1012,36 +1012,65 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); - ClearPageError(req->wb_page); *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); - if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, - req_offset(req), desc->pg_count, - IOMODE_RW, GFP_NOFS); if ((desc->pg_ioflags & FLUSH_COND_STABLE) && (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); + nfs_write_rpcsetup(req, data, desc->pg_count, 0, desc->pg_ioflags); + list_add(&data->list, res); + desc->pg_rpc_callops = &nfs_write_full_ops; out: - put_lseg(lseg); /* Cleans any gotten in ->pg_test */ - desc->pg_lseg = NULL; return ret; } -static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, +int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head) +{ + if (desc->pg_bsize < PAGE_CACHE_SIZE) + return nfs_flush_multi(desc, head); + return nfs_flush_one(desc, head); +} + +static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + int ret; + + ret = nfs_generic_flush(desc, &head); + if (ret == 0) + ret = nfs_do_multiple_writes(&head, desc->pg_rpc_callops, + desc->pg_ioflags); + return ret; +} + +static const struct nfs_pageio_ops nfs_pageio_write_ops = { + .pg_test = nfs_generic_pg_test, + .pg_doio = nfs_generic_pg_writepages, +}; + +static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) { - size_t wsize = NFS_SERVER(inode)->wsize; + nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, + NFS_SERVER(inode)->wsize, ioflags); +} - if (wsize < PAGE_CACHE_SIZE) - nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); - else - nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); +void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) +{ + pgio->pg_ops = &nfs_pageio_write_ops; + pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; +} +EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); + +static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags) +{ + if (!pnfs_pageio_init_write(pgio, inode, ioflags)) + nfs_pageio_init_write_mds(pgio, inode, ioflags); } /* @@ -1137,7 +1166,13 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) static void nfs_writeback_release_full(void *calldata) { struct nfs_write_data *data = calldata; - int status = data->task.tk_status; + int ret, status = data->task.tk_status; + struct nfs_pageio_descriptor pgio; + + if (data->pnfs_error) { + nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE); + pgio.pg_recoalesce = 1; + } /* Update attributes as result of writeback. */ while (!list_empty(&data->pages)) { @@ -1153,6 +1188,11 @@ static void nfs_writeback_release_full(void *calldata) req->wb_bytes, (long long)req_offset(req)); + if (data->pnfs_error) { + dprintk(", pnfs error = %d\n", data->pnfs_error); + goto next; + } + if (status < 0) { nfs_set_pageerror(page); nfs_context_set_write_error(req->wb_context, status); @@ -1172,7 +1212,19 @@ remove_request: next: nfs_clear_page_tag_locked(req); nfs_end_page_writeback(page); + if (data->pnfs_error) { + lock_page(page); + nfs_pageio_cond_complete(&pgio, page->index); + ret = nfs_page_async_flush(&pgio, page, 0); + if (ret) { + nfs_set_pageerror(page); + dprintk("rewrite to MDS error = %d\n", ret); + } + unlock_page(page); + } } + if (data->pnfs_error) + nfs_pageio_complete(&pgio); nfs_writedata_release(calldata); } @@ -1192,7 +1244,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; - struct nfs_server *server = NFS_SERVER(data->inode); int status; dprintk("NFS: %5u nfs_writeback_done (status %d)\n", @@ -1226,7 +1277,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) if (time_before(complain, jiffies)) { dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", - server->nfs_client->cl_hostname, + NFS_SERVER(data->inode)->nfs_client->cl_hostname, resp->verf->committed, argp->stable); complain = jiffies + 300 * HZ; } @@ -1253,7 +1304,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ argp->stable = NFS_FILE_SYNC; } - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return; } if (time_before(complain, jiffies)) { @@ -1525,6 +1576,10 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr int flags = FLUSH_SYNC; int ret = 0; + /* no commits means nothing needs to be done */ + if (!nfsi->ncommit) + return ret; + if (wbc->sync_mode == WB_SYNC_NONE) { /* Don't commit yet if this is a non-blocking flush and there * are a lot of outstanding writes for this mapping. @@ -1566,8 +1621,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) int status; bool sync = true; - if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || - wbc->for_background) + if (wbc->sync_mode == WB_SYNC_NONE) sync = false; status = pnfs_layoutcommit_inode(inode, sync); @@ -1659,34 +1713,20 @@ out_error: int nfs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page) { - struct nfs_page *req; - int ret; + /* + * If PagePrivate is set, then the page is currently associated with + * an in-progress read or write request. Don't try to migrate it. + * + * FIXME: we could do this in principle, but we'll need a way to ensure + * that we can safely release the inode reference while holding + * the page lock. + */ + if (PagePrivate(page)) + return -EBUSY; nfs_fscache_release_page(page, GFP_KERNEL); - req = nfs_find_and_lock_request(page, false); - ret = PTR_ERR(req); - if (IS_ERR(req)) - goto out; - - ret = migrate_page(mapping, newpage, page); - if (!req) - goto out; - if (ret) - goto out_unlock; - page_cache_get(newpage); - spin_lock(&mapping->host->i_lock); - req->wb_page = newpage; - SetPagePrivate(newpage); - set_page_private(newpage, (unsigned long)req); - ClearPagePrivate(page); - set_page_private(page, 0); - spin_unlock(&mapping->host->i_lock); - page_cache_release(page); -out_unlock: - nfs_clear_page_tag_locked(req); -out: - return ret; + return migrate_page(mapping, newpage, page); } #endif diff --git a/fs/nfsctl.c b/fs/nfsctl.c deleted file mode 100644 index 124e8fcb0dd6..000000000000 --- a/fs/nfsctl.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * fs/nfsctl.c - * - * This should eventually move to userland. - * - */ -#include <linux/types.h> -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/nfsd/syscall.h> -#include <linux/cred.h> -#include <linux/sched.h> -#include <linux/linkage.h> -#include <linux/namei.h> -#include <linux/mount.h> -#include <linux/syscalls.h> -#include <asm/uaccess.h> - -/* - * open a file on nfsd fs - */ - -static struct file *do_open(char *name, int flags) -{ - struct vfsmount *mnt; - struct file *file; - - mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); - if (IS_ERR(mnt)) - return (struct file *)mnt; - - file = file_open_root(mnt->mnt_root, mnt, name, flags); - - mntput(mnt); /* drop do_kern_mount reference */ - return file; -} - -static struct { - char *name; int wsize; int rsize; -} map[] = { - [NFSCTL_SVC] = { - .name = ".svc", - .wsize = sizeof(struct nfsctl_svc) - }, - [NFSCTL_ADDCLIENT] = { - .name = ".add", - .wsize = sizeof(struct nfsctl_client) - }, - [NFSCTL_DELCLIENT] = { - .name = ".del", - .wsize = sizeof(struct nfsctl_client) - }, - [NFSCTL_EXPORT] = { - .name = ".export", - .wsize = sizeof(struct nfsctl_export) - }, - [NFSCTL_UNEXPORT] = { - .name = ".unexport", - .wsize = sizeof(struct nfsctl_export) - }, - [NFSCTL_GETFD] = { - .name = ".getfd", - .wsize = sizeof(struct nfsctl_fdparm), - .rsize = NFS_FHSIZE - }, - [NFSCTL_GETFS] = { - .name = ".getfs", - .wsize = sizeof(struct nfsctl_fsparm), - .rsize = sizeof(struct knfsd_fh) - }, -}; - -SYSCALL_DEFINE3(nfsservctl, int, cmd, struct nfsctl_arg __user *, arg, - void __user *, res) -{ - struct file *file; - void __user *p = &arg->u; - int version; - int err; - - if (copy_from_user(&version, &arg->ca_version, sizeof(int))) - return -EFAULT; - - if (version != NFSCTL_VERSION) - return -EINVAL; - - if (cmd < 0 || cmd >= ARRAY_SIZE(map) || !map[cmd].name) - return -EINVAL; - - file = do_open(map[cmd].name, map[cmd].rsize ? O_RDWR : O_WRONLY); - if (IS_ERR(file)) - return PTR_ERR(file); - err = file->f_op->write(file, p, map[cmd].wsize, &file->f_pos); - if (err >= 0 && map[cmd].rsize) - err = file->f_op->read(file, res, map[cmd].rsize, &file->f_pos); - if (err >= 0) - err = 0; - fput(file); - return err; -} diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index fbb2a5ef5817..10e6366608f2 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -28,18 +28,6 @@ config NFSD If unsure, say N. -config NFSD_DEPRECATED - bool "Include support for deprecated syscall interface to NFSD" - depends on NFSD - default y - help - The syscall interface to nfsd was obsoleted in 2.6.0 by a new - filesystem based interface. The old interface is due for removal - in 2.6.40. If you wish to remove the interface before then - say N. - - In unsure, say Y. - config NFSD_V2_ACL bool depends on NFSD diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index d892be61016c..93cc9d34c459 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -69,7 +69,7 @@ enum { int nfsd_reply_cache_init(void); void nfsd_reply_cache_shutdown(void); -int nfsd_cache_lookup(struct svc_rqst *, int); +int nfsd_cache_lookup(struct svc_rqst *); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); #ifdef CONFIG_NFSD_V4 diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index b9566e46219f..62f3b9074e84 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -16,7 +16,6 @@ #include <linux/module.h> #include <linux/exportfs.h> -#include <linux/nfsd/syscall.h> #include <net/ipv6.h> #include "nfsd.h" @@ -318,7 +317,6 @@ static void svc_export_put(struct kref *ref) struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); - kfree(exp->ex_pathname); nfsd4_fslocs_free(&exp->ex_fslocs); kfree(exp); } @@ -528,11 +526,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; - err = -ENOMEM; - exp.ex_pathname = kstrdup(buf, GFP_KERNEL); - if (!exp.ex_pathname) - goto out2; - /* expiry */ err = -EINVAL; exp.h.expiry_time = get_expiry(&mesg); @@ -613,8 +606,6 @@ out4: nfsd4_fslocs_free(&exp.ex_fslocs); kfree(exp.ex_uuid); out3: - kfree(exp.ex_pathname); -out2: path_put(&exp.ex_path); out1: auth_domain_put(dom); @@ -678,7 +669,6 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_client = item->ex_client; new->ex_path.dentry = dget(item->ex_path.dentry); new->ex_path.mnt = mntget(item->ex_path.mnt); - new->ex_pathname = NULL; new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; @@ -696,8 +686,6 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) new->ex_fsid = item->ex_fsid; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; - new->ex_pathname = item->ex_pathname; - item->ex_pathname = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; item->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; @@ -797,58 +785,6 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) return ek; } -#ifdef CONFIG_NFSD_DEPRECATED -static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, - struct svc_export *exp) -{ - struct svc_expkey key, *ek; - - key.ek_client = clp; - key.ek_fsidtype = fsid_type; - memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); - key.ek_path = exp->ex_path; - key.h.expiry_time = NEVER; - key.h.flags = 0; - - ek = svc_expkey_lookup(&key); - if (ek) - ek = svc_expkey_update(&key,ek); - if (ek) { - cache_put(&ek->h, &svc_expkey_cache); - return 0; - } - return -ENOMEM; -} - -/* - * Find the client's export entry matching xdev/xino. - */ -static inline struct svc_expkey * -exp_get_key(svc_client *clp, dev_t dev, ino_t ino) -{ - u32 fsidv[3]; - - if (old_valid_dev(dev)) { - mk_fsid(FSID_DEV, fsidv, dev, ino, 0, NULL); - return exp_find_key(clp, FSID_DEV, fsidv, NULL); - } - mk_fsid(FSID_ENCODE_DEV, fsidv, dev, ino, 0, NULL); - return exp_find_key(clp, FSID_ENCODE_DEV, fsidv, NULL); -} - -/* - * Find the client's export entry matching fsid - */ -static inline struct svc_expkey * -exp_get_fsid_key(svc_client *clp, int fsid) -{ - u32 fsidv[2]; - - mk_fsid(FSID_NUM, fsidv, 0, 0, fsid, NULL); - - return exp_find_key(clp, FSID_NUM, fsidv, NULL); -} -#endif static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, struct cache_req *reqp) @@ -890,275 +826,7 @@ static struct svc_export *exp_parent(svc_client *clp, struct path *path) return exp; } -#ifdef CONFIG_NFSD_DEPRECATED -/* - * Hashtable locking. Write locks are placed only by user processes - * wanting to modify export information. - * Write locking only done in this file. Read locking - * needed externally. - */ -static DECLARE_RWSEM(hash_sem); - -void -exp_readlock(void) -{ - down_read(&hash_sem); -} - -static inline void -exp_writelock(void) -{ - down_write(&hash_sem); -} - -void -exp_readunlock(void) -{ - up_read(&hash_sem); -} - -static inline void -exp_writeunlock(void) -{ - up_write(&hash_sem); -} -#else - -/* hash_sem not needed once deprecated interface is removed */ -void exp_readlock(void) {} -static inline void exp_writelock(void){} -void exp_readunlock(void) {} -static inline void exp_writeunlock(void){} - -#endif - -#ifdef CONFIG_NFSD_DEPRECATED -static void exp_do_unexport(svc_export *unexp); -static int exp_verify_string(char *cp, int max); - -static void exp_fsid_unhash(struct svc_export *exp) -{ - struct svc_expkey *ek; - - if ((exp->ex_flags & NFSEXP_FSID) == 0) - return; - - ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); - if (!IS_ERR(ek)) { - sunrpc_invalidate(&ek->h, &svc_expkey_cache); - cache_put(&ek->h, &svc_expkey_cache); - } -} - -static int exp_fsid_hash(svc_client *clp, struct svc_export *exp) -{ - u32 fsid[2]; - - if ((exp->ex_flags & NFSEXP_FSID) == 0) - return 0; - - mk_fsid(FSID_NUM, fsid, 0, 0, exp->ex_fsid, NULL); - return exp_set_key(clp, FSID_NUM, fsid, exp); -} - -static int exp_hash(struct auth_domain *clp, struct svc_export *exp) -{ - u32 fsid[2]; - struct inode *inode = exp->ex_path.dentry->d_inode; - dev_t dev = inode->i_sb->s_dev; - - if (old_valid_dev(dev)) { - mk_fsid(FSID_DEV, fsid, dev, inode->i_ino, 0, NULL); - return exp_set_key(clp, FSID_DEV, fsid, exp); - } - mk_fsid(FSID_ENCODE_DEV, fsid, dev, inode->i_ino, 0, NULL); - return exp_set_key(clp, FSID_ENCODE_DEV, fsid, exp); -} - -static void exp_unhash(struct svc_export *exp) -{ - struct svc_expkey *ek; - struct inode *inode = exp->ex_path.dentry->d_inode; - - ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); - if (!IS_ERR(ek)) { - sunrpc_invalidate(&ek->h, &svc_expkey_cache); - cache_put(&ek->h, &svc_expkey_cache); - } -} - -/* - * Export a file system. - */ -int -exp_export(struct nfsctl_export *nxp) -{ - svc_client *clp; - struct svc_export *exp = NULL; - struct svc_export new; - struct svc_expkey *fsid_key = NULL; - struct path path; - int err; - - /* Consistency check */ - err = -EINVAL; - if (!exp_verify_string(nxp->ex_path, NFS_MAXPATHLEN) || - !exp_verify_string(nxp->ex_client, NFSCLNT_IDMAX)) - goto out; - - dprintk("exp_export called for %s:%s (%x/%ld fl %x).\n", - nxp->ex_client, nxp->ex_path, - (unsigned)nxp->ex_dev, (long)nxp->ex_ino, - nxp->ex_flags); - - /* Try to lock the export table for update */ - exp_writelock(); - - /* Look up client info */ - if (!(clp = auth_domain_find(nxp->ex_client))) - goto out_unlock; - - - /* Look up the dentry */ - err = kern_path(nxp->ex_path, 0, &path); - if (err) - goto out_put_clp; - err = -EINVAL; - - exp = exp_get_by_name(clp, &path, NULL); - - memset(&new, 0, sizeof(new)); - - /* must make sure there won't be an ex_fsid clash */ - if ((nxp->ex_flags & NFSEXP_FSID) && - (!IS_ERR(fsid_key = exp_get_fsid_key(clp, nxp->ex_dev))) && - fsid_key->ek_path.mnt && - (fsid_key->ek_path.mnt != path.mnt || - fsid_key->ek_path.dentry != path.dentry)) - goto finish; - - if (!IS_ERR(exp)) { - /* just a flags/id/fsid update */ - - exp_fsid_unhash(exp); - exp->ex_flags = nxp->ex_flags; - exp->ex_anon_uid = nxp->ex_anon_uid; - exp->ex_anon_gid = nxp->ex_anon_gid; - exp->ex_fsid = nxp->ex_dev; - - err = exp_fsid_hash(clp, exp); - goto finish; - } - - err = check_export(path.dentry->d_inode, &nxp->ex_flags, NULL); - if (err) goto finish; - - err = -ENOMEM; - - dprintk("nfsd: creating export entry %p for client %p\n", exp, clp); - - new.h.expiry_time = NEVER; - new.h.flags = 0; - new.ex_pathname = kstrdup(nxp->ex_path, GFP_KERNEL); - if (!new.ex_pathname) - goto finish; - new.ex_client = clp; - new.ex_path = path; - new.ex_flags = nxp->ex_flags; - new.ex_anon_uid = nxp->ex_anon_uid; - new.ex_anon_gid = nxp->ex_anon_gid; - new.ex_fsid = nxp->ex_dev; - - exp = svc_export_lookup(&new); - if (exp) - exp = svc_export_update(&new, exp); - - if (!exp) - goto finish; - - if (exp_hash(clp, exp) || - exp_fsid_hash(clp, exp)) { - /* failed to create at least one index */ - exp_do_unexport(exp); - cache_flush(); - } else - err = 0; -finish: - kfree(new.ex_pathname); - if (!IS_ERR_OR_NULL(exp)) - exp_put(exp); - if (!IS_ERR_OR_NULL(fsid_key)) - cache_put(&fsid_key->h, &svc_expkey_cache); - path_put(&path); -out_put_clp: - auth_domain_put(clp); -out_unlock: - exp_writeunlock(); -out: - return err; -} - -/* - * Unexport a file system. The export entry has already - * been removed from the client's list of exported fs's. - */ -static void -exp_do_unexport(svc_export *unexp) -{ - sunrpc_invalidate(&unexp->h, &svc_export_cache); - exp_unhash(unexp); - exp_fsid_unhash(unexp); -} - - -/* - * unexport syscall. - */ -int -exp_unexport(struct nfsctl_export *nxp) -{ - struct auth_domain *dom; - svc_export *exp; - struct path path; - int err; - - /* Consistency check */ - if (!exp_verify_string(nxp->ex_path, NFS_MAXPATHLEN) || - !exp_verify_string(nxp->ex_client, NFSCLNT_IDMAX)) - return -EINVAL; - - exp_writelock(); - - err = -EINVAL; - dom = auth_domain_find(nxp->ex_client); - if (!dom) { - dprintk("nfsd: unexport couldn't find %s\n", nxp->ex_client); - goto out_unlock; - } - - err = kern_path(nxp->ex_path, 0, &path); - if (err) - goto out_domain; - - err = -EINVAL; - exp = exp_get_by_name(dom, &path, NULL); - path_put(&path); - if (IS_ERR(exp)) - goto out_domain; - - exp_do_unexport(exp); - exp_put(exp); - err = 0; - -out_domain: - auth_domain_put(dom); - cache_flush(); -out_unlock: - exp_writeunlock(); - return err; -} -#endif /* CONFIG_NFSD_DEPRECATED */ /* * Obtain the root fh on behalf of a client. @@ -1330,7 +998,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) return exp; } -static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp) +struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp) { u32 fsidv[2]; @@ -1350,7 +1018,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) struct svc_export *exp; __be32 rv; - exp = find_fsidzero_export(rqstp); + exp = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); @@ -1367,7 +1035,6 @@ static void *e_start(struct seq_file *m, loff_t *pos) unsigned hash, export; struct cache_head *ch; - exp_readlock(); read_lock(&svc_export_cache.hash_lock); if (!n--) return SEQ_START_TOKEN; @@ -1418,7 +1085,6 @@ static void e_stop(struct seq_file *m, void *p) __releases(svc_export_cache.hash_lock) { read_unlock(&svc_export_cache.hash_lock); - exp_readunlock(); } static struct flags { @@ -1550,97 +1216,6 @@ const struct seq_operations nfs_exports_op = { .show = e_show, }; -#ifdef CONFIG_NFSD_DEPRECATED -/* - * Add or modify a client. - * Change requests may involve the list of host addresses. The list of - * exports and possibly existing uid maps are left untouched. - */ -int -exp_addclient(struct nfsctl_client *ncp) -{ - struct auth_domain *dom; - int i, err; - struct in6_addr addr6; - - /* First, consistency check. */ - err = -EINVAL; - if (! exp_verify_string(ncp->cl_ident, NFSCLNT_IDMAX)) - goto out; - if (ncp->cl_naddr > NFSCLNT_ADDRMAX) - goto out; - - /* Lock the hashtable */ - exp_writelock(); - - dom = unix_domain_find(ncp->cl_ident); - - err = -ENOMEM; - if (!dom) - goto out_unlock; - - /* Insert client into hashtable. */ - for (i = 0; i < ncp->cl_naddr; i++) { - ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6); - auth_unix_add_addr(&init_net, &addr6, dom); - } - auth_unix_forget_old(dom); - auth_domain_put(dom); - - err = 0; - -out_unlock: - exp_writeunlock(); -out: - return err; -} - -/* - * Delete a client given an identifier. - */ -int -exp_delclient(struct nfsctl_client *ncp) -{ - int err; - struct auth_domain *dom; - - err = -EINVAL; - if (!exp_verify_string(ncp->cl_ident, NFSCLNT_IDMAX)) - goto out; - - /* Lock the hashtable */ - exp_writelock(); - - dom = auth_domain_find(ncp->cl_ident); - /* just make sure that no addresses work - * and that it will expire soon - */ - if (dom) { - err = auth_unix_forget_old(dom); - auth_domain_put(dom); - } - - exp_writeunlock(); -out: - return err; -} - -/* - * Verify that string is non-empty and does not exceed max length. - */ -static int -exp_verify_string(char *cp, int max) -{ - int i; - - for (i = 0; i < max; i++) - if (!cp[i]) - return i; - cp[i] = 0; - printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp); - return 0; -} -#endif /* CONFIG_NFSD_DEPRECATED */ /* * Initialize the exports module. @@ -1667,10 +1242,8 @@ nfsd_export_init(void) void nfsd_export_flush(void) { - exp_writelock(); cache_purge(&svc_expkey_cache); cache_purge(&svc_export_cache); - exp_writeunlock(); } /* @@ -1682,12 +1255,9 @@ nfsd_export_shutdown(void) dprintk("nfsd: shutting down export module.\n"); - exp_writelock(); - cache_unregister(&svc_expkey_cache); cache_unregister(&svc_export_cache); svcauth_unix_purge(); - exp_writeunlock(); dprintk("nfsd: export shutdown complete.\n"); } diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 7c831a2731fa..77e7a5cca888 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -35,10 +35,8 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size); fh.fh_export = NULL; - exp_readlock(); nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); fh_put(&fh); - exp_readunlock(); /* We return nlm error codes as nlm doesn't know * about nfsd, but nfsd does know about nlm.. */ diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index ad88f1c0a4c3..9c51aff02ae2 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -36,6 +36,7 @@ #include <linux/slab.h> #include <linux/nfs_fs.h> +#include <linux/export.h> #include "acl.h" diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 02eb4edf0ece..7748d6a18d97 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -39,6 +39,8 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC +static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); + #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 @@ -351,7 +353,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr, __be32 *p; encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); - encode_stateid4(xdr, &dp->dl_stateid); + encode_stateid4(xdr, &dp->dl_stid.sc_stateid); p = xdr_reserve_space(xdr, 4); *p++ = xdr_zero; /* truncate */ @@ -460,6 +462,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, */ status = 0; out: + if (status) + nfsd4_mark_cb_fault(cb->cb_clp, status); return status; out_overflow: print_overflow_msg(__func__, xdr); @@ -686,6 +690,12 @@ static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) warn_no_callback_path(clp, reason); } +static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) +{ + clp->cl_cb_state = NFSD4_CB_FAULT; + warn_no_callback_path(clp, reason); +} + static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) { struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); @@ -787,7 +797,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; u32 minorversion = clp->cl_minorversion; cb->cb_minorversion = minorversion; @@ -809,7 +819,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; dprintk("%s: minorversion=%d\n", __func__, clp->cl_minorversion); @@ -832,7 +842,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; struct rpc_clnt *current_rpc_client = clp->cl_cb_client; nfsd4_cb_done(task, calldata); @@ -1006,7 +1016,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w) void nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfsd4_callback *cb = &dp->dl_recall; - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; dp->dl_retries = 1; cb->cb_op = dp; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 3a6dbd70b34b..fa383361bc61 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -35,6 +35,7 @@ #include <linux/file.h> #include <linux/slab.h> +#include "idmap.h" #include "cache.h" #include "xdr4.h" #include "vfs.h" @@ -156,6 +157,8 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) return nfserr_inval; + accmode |= NFSD_MAY_READ_IF_EXEC; + if (open->op_share_access & NFS4_SHARE_ACCESS_READ) accmode |= NFSD_MAY_READ; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) @@ -168,12 +171,29 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs return status; } +static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) +{ + umode_t mode = fh->fh_dentry->d_inode->i_mode; + + if (S_ISREG(mode)) + return nfs_ok; + if (S_ISDIR(mode)) + return nfserr_isdir; + /* + * Using err_symlink as our catch-all case may look odd; but + * there's no other obvious error for this case in 4.0, and we + * happen to know that it will cause the linux v4 client to do + * the right thing on attempts to open something other than a + * regular file. + */ + return nfserr_symlink; +} + static __be32 do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { struct svc_fh resfh; __be32 status; - int created = 0; fh_init(&resfh, NFS4_FHSIZE); open->op_truncate = 0; @@ -202,7 +222,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o open->op_fname.len, &open->op_iattr, &resfh, open->op_createmode, (u32 *)open->op_verf.data, - &open->op_truncate, &created); + &open->op_truncate, &open->op_created); /* * Following rfc 3530 14.2.16, use the returned bitmask @@ -216,6 +236,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o status = nfsd_lookup(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &resfh); fh_unlock(current_fh); + if (status) + goto out; + status = nfsd_check_obj_isreg(&resfh); } if (status) goto out; @@ -227,9 +250,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o fh_dup2(current_fh, &resfh); /* set reply cache */ - fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, &resfh.fh_handle); - if (!created) + if (!open->op_created) status = do_open_permission(rqstp, current_fh, open, NFSD_MAY_NOP); @@ -254,7 +277,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); /* set replay cache */ - fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, ¤t_fh->fh_handle); open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && @@ -283,14 +306,27 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd4_compoundres *resp; - dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", + dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", (int)open->op_fname.len, open->op_fname.data, - open->op_stateowner); + open->op_openowner); /* This check required by spec. */ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; + /* We don't yet support WANT bits: */ + open->op_share_access &= NFS4_SHARE_ACCESS_MASK; + + open->op_created = 0; + /* + * RFC5661 18.51.3 + * Before RECLAIM_COMPLETE done, server should deny new lock + */ + if (nfsd4_has_session(cstate) && + !cstate->session->se_client->cl_firststate && + open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + return nfserr_grace; + if (nfsd4_has_session(cstate)) copy_clientid(&open->op_clientid, cstate->session); @@ -300,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, resp = rqstp->rq_resp; status = nfsd4_process_open1(&resp->cstate, open); if (status == nfserr_replay_me) { - struct nfs4_replay *rp = &open->op_stateowner->so_replay; + struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay; fh_put(&cstate->current_fh); fh_copy_shallow(&cstate->current_fh.fh_handle, &rp->rp_openfh); @@ -330,32 +366,23 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_NULL: - /* - * (1) set CURRENT_FH to the file being opened, - * creating it if necessary, (2) set open->op_cinfo, - * (3) set open->op_truncate if the file is to be - * truncated after opening, (4) do permission checking. - */ status = do_open_lookup(rqstp, &cstate->current_fh, open); if (status) goto out; break; case NFS4_OPEN_CLAIM_PREVIOUS: - open->op_stateowner->so_confirmed = 1; - /* - * The CURRENT_FH is already set to the file being - * opened. (1) set open->op_cinfo, (2) set - * open->op_truncate if the file is to be truncated - * after opening, (3) do permission checking. - */ + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, &cstate->current_fh, open); if (status) goto out; break; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: case NFS4_OPEN_CLAIM_DELEGATE_PREV: - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; dprintk("NFSD: unsupported OPEN claim type %d\n", open->op_claim_type); status = nfserr_notsupp; @@ -372,12 +399,13 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * set, (2) sets open->op_stateid, (3) sets open->op_delegation. */ status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); + WARN_ON(status && open->op_created); out: - if (open->op_stateowner) { - nfs4_get_stateowner(open->op_stateowner); - cstate->replay_owner = open->op_stateowner; - } - nfs4_unlock_state(); + nfsd4_cleanup_open_state(open, status); + if (open->op_openowner) + cstate->replay_owner = &open->op_openowner->oo_owner; + else + nfs4_unlock_state(); return status; } @@ -458,17 +486,12 @@ static __be32 nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_commit *commit) { - __be32 status; - u32 *p = (u32 *)commit->co_verf.data; *p++ = nfssvc_boot.tv_sec; *p++ = nfssvc_boot.tv_usec; - status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, commit->co_count); - if (status == nfserr_symlink) - status = nfserr_inval; - return status; } static __be32 @@ -483,8 +506,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_CREATE); - if (status == nfserr_symlink) - status = nfserr_notdir; if (status) return status; @@ -682,7 +703,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); - if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || + if ((cookie == 1) || (cookie == 2) || (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) return nfserr_bad_cookie; @@ -710,8 +731,6 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); - if (status == nfserr_symlink) - return nfserr_notdir; if (!status) { fh_unlock(&cstate->current_fh); set_change_info(&remove->rm_cinfo, &cstate->current_fh); @@ -742,8 +761,6 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) && S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode))) status = nfserr_exist; - else if (status == nfserr_symlink) - status = nfserr_notdir; if (!status) { set_change_info(&rename->rn_sinfo, &cstate->current_fh); @@ -883,8 +900,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_bytes_written = cnt; - if (status == nfserr_symlink) - status = nfserr_inval; return status; } @@ -921,7 +936,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, count = 4 + (verify->ve_attrlen >> 2); buf = kmalloc(count << 2, GFP_KERNEL); if (!buf) - return nfserr_resource; + return nfserr_jukebox; status = nfsd4_encode_fattr(&cstate->current_fh, cstate->current_fh.fh_export, @@ -985,6 +1000,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum) typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, void *); +typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op); + enum nfsd4_op_flags { ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ @@ -992,12 +1009,31 @@ enum nfsd4_op_flags { /* For rfc 5661 section 2.6.3.1.1: */ OP_HANDLES_WRONGSEC = 1 << 3, OP_IS_PUTFH_LIKE = 1 << 4, + /* + * These are the ops whose result size we estimate before + * encoding, to avoid performing an op then not being able to + * respond or cache a response. This includes writes and setattrs + * as well as the operations usually called "nonidempotent": + */ + OP_MODIFIES_SOMETHING = 1 << 5, + /* + * Cache compounds containing these ops in the xid-based drc: + * We use the DRC for compounds containing non-idempotent + * operations, *except* those that are 4.1-specific (since + * sessions provide their own EOS), and except for stateful + * operations other than setclientid and setclientid_confirm + * (since sequence numbers provide EOS for open, lock, etc in + * the v4.0 case). + */ + OP_CACHEME = 1 << 6, }; struct nfsd4_operation { nfsd4op_func op_func; u32 op_flags; char *op_name; + /* Try to get response size before operation */ + nfsd4op_rsize op_rsize_bop; }; static struct nfsd4_operation nfsd4_ops[]; @@ -1042,6 +1078,11 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) return &nfsd4_ops[op->opnum]; } +bool nfsd4_cache_this_op(struct nfsd4_op *op) +{ + return OPDESC(op)->op_flags & OP_CACHEME; +} + static bool need_wrongsec_check(struct svc_rqst *rqstp) { struct nfsd4_compoundres *resp = rqstp->rq_resp; @@ -1087,6 +1128,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, struct nfsd4_operation *opdesc; struct nfsd4_compound_state *cstate = &resp->cstate; int slack_bytes; + u32 plen = 0; __be32 status; resp->xbuf = &rqstp->rq_res; @@ -1165,6 +1207,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, goto encode_op; } + /* If op is non-idempotent */ + if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { + plen = opdesc->op_rsize_bop(rqstp, op); + op->status = nfsd4_check_resp_size(resp, plen); + } + + if (op->status) + goto encode_op; + if (opdesc->op_func) op->status = opdesc->op_func(rqstp, cstate, &op->u); else @@ -1194,7 +1245,7 @@ encode_op: be32_to_cpu(status)); if (cstate->replay_owner) { - nfs4_put_stateowner(cstate->replay_owner); + nfs4_unlock_state(); cstate->replay_owner = NULL; } /* XXX Ugh, we need to get rid of this kind of special case: */ @@ -1209,13 +1260,150 @@ encode_op: fh_put(&resp->cstate.save_fh); BUG_ON(resp->cstate.replay_owner); out: - nfsd4_release_compoundargs(args); /* Reset deferral mechanism for RPC deferrals */ rqstp->rq_usedeferral = 1; dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } +#define op_encode_hdr_size (2) +#define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define op_encode_change_info_maxsz (5) +#define nfs4_fattr_bitmap_maxsz (4) + +#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz) + +#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) + +#define op_encode_ace_maxsz (3 + nfs4_owner_maxsz) +#define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \ + op_encode_ace_maxsz) + +#define op_encode_channel_attrs_maxsz (6 + 1 + 1) + +static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size) * sizeof(__be32); +} + +static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); +} + +static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_lock_denied_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz + + op_encode_change_info_maxsz + 1 + + nfs4_fattr_bitmap_maxsz + + op_encode_delegation_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = 0, rlen = 0; + + maxcount = svc_max_payload(rqstp); + rlen = op->u.read.rd_length; + + if (rlen > maxcount) + rlen = maxcount; + + return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen; +} + +static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 rlen = op->u.readdir.rd_maxcount; + + if (rlen > PAGE_SIZE) + rlen = PAGE_SIZE; + + return (op_encode_hdr_size + op_encode_verifier_maxsz) + * sizeof(__be32) + rlen; +} + +static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + op_encode_change_info_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32); +} + +static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ + 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\ + 2 + /*eir_server_owner.so_minor_id */\ + /* eir_server_owner.so_major_id<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + /* eir_server_scope<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + 1 + /* eir_server_impl_id array length */\ + 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32); +} + +static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\ + 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32); +} + +static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\ + 2 + /* csr_sequence, csr_flags */\ + op_encode_channel_attrs_maxsz + \ + op_encode_channel_attrs_maxsz) * sizeof(__be32); +} + static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, @@ -1223,19 +1411,27 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_CLOSE] = { .op_func = (nfsd4op_func)nfsd4_close, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_CLOSE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_COMMIT] = { .op_func = (nfsd4op_func)nfsd4_commit, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_COMMIT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize, }, [OP_CREATE] = { .op_func = (nfsd4op_func)nfsd4_create, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_CREATE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize, }, [OP_DELEGRETURN] = { .op_func = (nfsd4op_func)nfsd4_delegreturn, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_DELEGRETURN", + .op_rsize_bop = nfsd4_only_status_rsize, }, [OP_GETATTR] = { .op_func = (nfsd4op_func)nfsd4_getattr, @@ -1248,11 +1444,16 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, + .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING + | OP_CACHEME, .op_name = "OP_LINK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize, }, [OP_LOCK] = { .op_func = (nfsd4op_func)nfsd4_lock, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_LOCK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize, }, [OP_LOCKT] = { .op_func = (nfsd4op_func)nfsd4_lockt, @@ -1260,7 +1461,9 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOCKU] = { .op_func = (nfsd4op_func)nfsd4_locku, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_LOCKU", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_LOOKUP] = { .op_func = (nfsd4op_func)nfsd4_lookup, @@ -1278,42 +1481,54 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_OPEN] = { .op_func = (nfsd4op_func)nfsd4_open, - .op_flags = OP_HANDLES_WRONGSEC, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize, }, [OP_OPEN_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_open_confirm, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN_CONFIRM", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_OPEN_DOWNGRADE] = { .op_func = (nfsd4op_func)nfsd4_open_downgrade, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN_DOWNGRADE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTPUBFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTPUBFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTROOTFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_READ] = { .op_func = (nfsd4op_func)nfsd4_read, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READ", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize, }, [OP_READDIR] = { .op_func = (nfsd4op_func)nfsd4_readdir, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READDIR", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize, }, [OP_READLINK] = { .op_func = (nfsd4op_func)nfsd4_readlink, @@ -1321,27 +1536,36 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_REMOVE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize, }, [OP_RENAME] = { - .op_name = "OP_RENAME", .op_func = (nfsd4op_func)nfsd4_rename, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_RENAME", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize, }, [OP_RENEW] = { .op_func = (nfsd4op_func)nfsd4_renew, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_RENEW", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, [OP_RESTOREFH] = { .op_func = (nfsd4op_func)nfsd4_restorefh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_RESTOREFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SAVEFH] = { .op_func = (nfsd4op_func)nfsd4_savefh, - .op_flags = OP_HANDLES_WRONGSEC, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, .op_name = "OP_SAVEFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SECINFO] = { .op_func = (nfsd4op_func)nfsd4_secinfo, @@ -1351,16 +1575,22 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, .op_name = "OP_SETATTR", + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize, }, [OP_SETCLIENTID] = { .op_func = (nfsd4op_func)nfsd4_setclientid, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_SETCLIENTID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize, }, [OP_SETCLIENTID_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_SETCLIENTID_CONFIRM", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_VERIFY] = { .op_func = (nfsd4op_func)nfsd4_verify, @@ -1368,50 +1598,81 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_WRITE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, }, [OP_RELEASE_LOCKOWNER] = { .op_func = (nfsd4op_func)nfsd4_release_lockowner, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_RELEASE_LOCKOWNER", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, /* NFSv4.1 operations */ [OP_EXCHANGE_ID] = { .op_func = (nfsd4op_func)nfsd4_exchange_id, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_EXCHANGE_ID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize, }, [OP_BIND_CONN_TO_SESSION] = { .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_BIND_CONN_TO_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize, }, [OP_CREATE_SESSION] = { .op_func = (nfsd4op_func)nfsd4_create_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_CREATE_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize, }, [OP_DESTROY_SESSION] = { .op_func = (nfsd4op_func)nfsd4_destroy_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_DESTROY_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SEQUENCE] = { .op_func = (nfsd4op_func)nfsd4_sequence, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, .op_name = "OP_SEQUENCE", }, + [OP_DESTROY_CLIENTID] = { + .op_func = (nfsd4op_func)nfsd4_destroy_clientid, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, + .op_name = "OP_DESTROY_CLIENTID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, [OP_RECLAIM_COMPLETE] = { .op_func = (nfsd4op_func)nfsd4_reclaim_complete, - .op_flags = ALLOWED_WITHOUT_FH, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, .op_name = "OP_RECLAIM_COMPLETE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SECINFO_NO_NAME] = { .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO_NO_NAME", }, + [OP_TEST_STATEID] = { + .op_func = (nfsd4op_func)nfsd4_test_stateid, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_TEST_STATEID", + }, + [OP_FREE_STATEID] = { + .op_func = (nfsd4op_func)nfsd4_free_stateid, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, + .op_name = "OP_FREE_STATEID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, }; static const char *nfsd4_op_name(unsigned opnum) @@ -1424,16 +1685,6 @@ static const char *nfsd4_op_name(unsigned opnum) #define nfsd4_voidres nfsd4_voidargs struct nfsd4_voidargs { int dummy; }; -/* - * TODO: At the present time, the NFSv4 server does not do XID caching - * of requests. Implementing XID caching would not be a serious problem, - * although it would require a mild change in interfaces since one - * doesn't know whether an NFSv4 request is idempotent until after the - * XDR decode. However, XID caching totally confuses pynfs (Peter - * Astrand's regression testsuite for NFSv4 servers), which reuses - * XID's liberally, so I've left it unimplemented until pynfs generates - * better XID's. - */ static struct svc_procedure nfsd_procedures4[2] = { [NFSPROC4_NULL] = { .pc_func = (svc_procfunc) nfsd4_proc_null, @@ -1449,6 +1700,7 @@ static struct svc_procedure nfsd_procedures4[2] = { .pc_encode = (kxdrproc_t) nfs4svc_encode_compoundres, .pc_argsize = sizeof(struct nfsd4_compoundargs), .pc_ressize = sizeof(struct nfsd4_compoundres), + .pc_release = nfsd4_release_compoundargs, .pc_cachetype = RC_NOCACHE, .pc_xdrressize = NFSD_BUFSIZE/4, }, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 29d77f60585b..ed083b9a731b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -45,6 +45,7 @@ /* Globals */ static struct file *rec_file; +static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static int nfs4_save_creds(const struct cred **original_creds) @@ -88,7 +89,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) struct xdr_netobj cksum; struct hash_desc desc; struct scatterlist sg; - __be32 status = nfserr_resource; + __be32 status = nfserr_jukebox; dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", clname->len, clname->data); @@ -129,6 +130,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (!rec_file || clp->cl_firststate) return 0; + clp->cl_firststate = 1; status = nfs4_save_creds(&original_cred); if (status < 0) return status; @@ -143,10 +145,8 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) goto out_unlock; } status = -EEXIST; - if (dentry->d_inode) { - dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); + if (dentry->d_inode) goto out_put; - } status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out_put; @@ -156,12 +156,14 @@ out_put: dput(dentry); out_unlock: mutex_unlock(&dir->d_inode->i_mutex); - if (status == 0) { - clp->cl_firststate = 1; + if (status == 0) vfs_fsync(rec_file, 0); - } + else + printk(KERN_ERR "NFSD: failed to write recovery record" + " (err %d); please check that %s exists" + " and is writeable", status, + user_recovery_dirname); nfs4_reset_creds(original_cred); - dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); return status; } @@ -354,13 +356,13 @@ nfsd4_recdir_load(void) { */ void -nfsd4_init_recdir(char *rec_dirname) +nfsd4_init_recdir() { const struct cred *original_cred; int status; printk("NFSD: Using %s as the NFSv4 state recovery directory\n", - rec_dirname); + user_recovery_dirname); BUG_ON(rec_file); @@ -372,10 +374,10 @@ nfsd4_init_recdir(char *rec_dirname) return; } - rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0); + rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); if (IS_ERR(rec_file)) { printk("NFSD: unable to find recovery directory %s\n", - rec_dirname); + user_recovery_dirname); rec_file = NULL; } @@ -390,3 +392,30 @@ nfsd4_shutdown_recdir(void) fput(rec_file); rec_file = NULL; } + +/* + * Change the NFSv4 recovery directory to recdir. + */ +int +nfs4_reset_recoverydir(char *recdir) +{ + int status; + struct path path; + + status = kern_path(recdir, LOOKUP_FOLLOW, &path); + if (status) + return status; + status = -ENOTDIR; + if (S_ISDIR(path.dentry->d_inode->i_mode)) { + strcpy(user_recovery_dirname, recdir); + status = 0; + } + path_put(&path); + return status; +} + +char * +nfs4_recoverydir(void) +{ + return user_recovery_dirname; +} diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e98f3c2e9492..47e94e33a975 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -37,6 +37,7 @@ #include <linux/slab.h> #include <linux/namei.h> #include <linux/swap.h> +#include <linux/pagemap.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/clnt.h> #include "xdr4.h" @@ -48,9 +49,6 @@ time_t nfsd4_lease = 90; /* default lease time */ time_t nfsd4_grace = 90; static time_t boot_time; -static u32 current_ownerid = 1; -static u32 current_fileid = 1; -static u32 current_delegid = 1; static stateid_t zerostateid; /* bits all 0 */ static stateid_t onestateid; /* bits all 1 */ static u64 current_sessionid = 1; @@ -59,10 +57,7 @@ static u64 current_sessionid = 1; #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) /* forward declarations */ -static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); -static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); -static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; -static void nfs4_set_recdir(char *recdir); +static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); /* Locking: */ @@ -76,7 +71,8 @@ static DEFINE_MUTEX(client_mutex); */ static DEFINE_SPINLOCK(recall_lock); -static struct kmem_cache *stateowner_slab = NULL; +static struct kmem_cache *openowner_slab = NULL; +static struct kmem_cache *lockowner_slab = NULL; static struct kmem_cache *file_slab = NULL; static struct kmem_cache *stateid_slab = NULL; static struct kmem_cache *deleg_slab = NULL; @@ -108,6 +104,11 @@ opaque_hashval(const void *ptr, int nbytes) static struct list_head del_recall_lru; +static void nfsd4_free_file(struct nfs4_file *f) +{ + kmem_cache_free(file_slab, f); +} + static inline void put_nfs4_file(struct nfs4_file *fi) { @@ -115,7 +116,7 @@ put_nfs4_file(struct nfs4_file *fi) list_del(&fi->fi_hash); spin_unlock(&recall_lock); iput(fi->fi_inode); - kmem_cache_free(file_slab, fi); + nfsd4_free_file(fi); } } @@ -132,35 +133,33 @@ unsigned int max_delegations; * Open owner state (share locks) */ -/* hash tables for nfs4_stateowner */ -#define OWNER_HASH_BITS 8 -#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) -#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) +/* hash tables for open owners */ +#define OPEN_OWNER_HASH_BITS 8 +#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) +#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) -#define ownerid_hashval(id) \ - ((id) & OWNER_HASH_MASK) -#define ownerstr_hashval(clientid, ownername) \ - (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) +static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) +{ + unsigned int ret; -static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; -static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; + ret = opaque_hashval(ownername->data, ownername->len); + ret += clientid; + return ret & OPEN_OWNER_HASH_MASK; +} + +static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; /* hash table for nfs4_file */ #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -/* hash table for (open)nfs4_stateid */ -#define STATEID_HASH_BITS 10 -#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) -#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) - -#define file_hashval(x) \ - hash_ptr(x, FILE_HASH_BITS) -#define stateid_hashval(owner_id, file_id) \ - (((owner_id) + (file_id)) & STATEID_HASH_MASK) +static unsigned int file_hashval(struct inode *ino) +{ + /* XXX: why are we hashing on inode pointer, anyway? */ + return hash_ptr(ino, FILE_HASH_BITS); +} static struct list_head file_hashtbl[FILE_HASH_SIZE]; -static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) { @@ -188,8 +187,15 @@ static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) { if (atomic_dec_and_test(&fp->fi_access[oflag])) { - nfs4_file_put_fd(fp, O_RDWR); nfs4_file_put_fd(fp, oflag); + /* + * It's also safe to get rid of the RDWR open *if* + * we no longer have need of the other kind of access + * or if we already have the other kind of open: + */ + if (fp->fi_fds[1-oflag] + || atomic_read(&fp->fi_access[1 - oflag]) == 0) + nfs4_file_put_fd(fp, O_RDWR); } } @@ -202,8 +208,73 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) __nfs4_file_put_access(fp, oflag); } +static inline int get_new_stid(struct nfs4_stid *stid) +{ + static int min_stateid = 0; + struct idr *stateids = &stid->sc_client->cl_stateids; + int new_stid; + int error; + + error = idr_get_new_above(stateids, stid, min_stateid, &new_stid); + /* + * Note: the necessary preallocation was done in + * nfs4_alloc_stateid(). The idr code caps the number of + * preallocations that can exist at a time, but the state lock + * prevents anyone from using ours before we get here: + */ + BUG_ON(error); + /* + * It shouldn't be a problem to reuse an opaque stateid value. + * I don't think it is for 4.1. But with 4.0 I worry that, for + * example, a stray write retransmission could be accepted by + * the server when it should have been rejected. Therefore, + * adopt a trick from the sctp code to attempt to maximize the + * amount of time until an id is reused, by ensuring they always + * "increase" (mod INT_MAX): + */ + + min_stateid = new_stid+1; + if (min_stateid == INT_MAX) + min_stateid = 0; + return new_stid; +} + +static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) +{ + stateid_t *s = &stid->sc_stateid; + int new_id; + + stid->sc_type = type; + stid->sc_client = cl; + s->si_opaque.so_clid = cl->cl_clientid; + new_id = get_new_stid(stid); + s->si_opaque.so_id = (u32)new_id; + /* Will be incremented before return to client: */ + s->si_generation = 0; +} + +static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) +{ + struct idr *stateids = &cl->cl_stateids; + + if (!idr_pre_get(stateids, GFP_KERNEL)) + return NULL; + /* + * Note: if we fail here (or any time between now and the time + * we actually get the new idr), we won't need to undo the idr + * preallocation, since the idr code caps the number of + * preallocated entries. + */ + return kmem_cache_alloc(slab, GFP_KERNEL); +} + +static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) +{ + return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); +} + static struct nfs4_delegation * -alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) +alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) { struct nfs4_delegation *dp; struct nfs4_file *fp = stp->st_file; @@ -220,21 +291,23 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f return NULL; if (num_delegations > max_delegations) return NULL; - dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); + dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); if (dp == NULL) return dp; + init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); + /* + * delegation seqid's are never incremented. The 4.1 special + * meaning of seqid 0 isn't meaningful, really, but let's avoid + * 0 anyway just for consistency and use 1: + */ + dp->dl_stid.sc_stateid.si_generation = 1; num_delegations++; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); - dp->dl_client = clp; get_nfs4_file(fp); dp->dl_file = fp; dp->dl_type = type; - dp->dl_stateid.si_boot = boot_time; - dp->dl_stateid.si_stateownerid = current_delegid++; - dp->dl_stateid.si_fileid = 0; - dp->dl_stateid.si_generation = 0; fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); @@ -263,10 +336,18 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) } } +static void unhash_stid(struct nfs4_stid *s) +{ + struct idr *stateids = &s->sc_client->cl_stateids; + + idr_remove(stateids, s->sc_stateid.si_opaque.so_id); +} + /* Called under the state lock. */ static void unhash_delegation(struct nfs4_delegation *dp) { + unhash_stid(&dp->dl_stid); list_del_init(&dp->dl_perclnt); spin_lock(&recall_lock); list_del_init(&dp->dl_perfile); @@ -288,10 +369,16 @@ static DEFINE_SPINLOCK(client_lock); #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) #define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) -#define clientid_hashval(id) \ - ((id) & CLIENT_HASH_MASK) -#define clientstr_hashval(name) \ - (opaque_hashval((name), 8) & CLIENT_HASH_MASK) +static unsigned int clientid_hashval(u32 id) +{ + return id & CLIENT_HASH_MASK; +} + +static unsigned int clientstr_hashval(const char *name) +{ + return opaque_hashval(name, 8) & CLIENT_HASH_MASK; +} + /* * reclaim_str_hashtbl[] holds known client info from previous reset/reboot * used in reboot/reset lease grace period processing @@ -358,7 +445,7 @@ set_deny(unsigned int *deny, unsigned long bmap) { } static int -test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { +test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { unsigned int access, deny; set_access(&access, stp->st_access_bmap); @@ -381,105 +468,121 @@ static int nfs4_access_to_omode(u32 access) BUG(); } -static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp) +static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) { - unsigned int access; - - set_access(&access, stp->st_access_bmap); - return nfs4_access_to_omode(access); -} - -static void unhash_generic_stateid(struct nfs4_stateid *stp) -{ - list_del(&stp->st_hash); list_del(&stp->st_perfile); list_del(&stp->st_perstateowner); } -static void free_generic_stateid(struct nfs4_stateid *stp) +static void close_generic_stateid(struct nfs4_ol_stateid *stp) { - int oflag; + int i; if (stp->st_access_bmap) { - oflag = nfs4_access_bmap_to_omode(stp); - nfs4_file_put_access(stp->st_file, oflag); + for (i = 1; i < 4; i++) { + if (test_bit(i, &stp->st_access_bmap)) + nfs4_file_put_access(stp->st_file, + nfs4_access_to_omode(i)); + __clear_bit(i, &stp->st_access_bmap); + } } put_nfs4_file(stp->st_file); + stp->st_file = NULL; +} + +static void free_generic_stateid(struct nfs4_ol_stateid *stp) +{ kmem_cache_free(stateid_slab, stp); } -static void release_lock_stateid(struct nfs4_stateid *stp) +static void release_lock_stateid(struct nfs4_ol_stateid *stp) { struct file *file; unhash_generic_stateid(stp); + unhash_stid(&stp->st_stid); file = find_any_file(stp->st_file); if (file) - locks_remove_posix(file, (fl_owner_t)stp->st_stateowner); + locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner)); + close_generic_stateid(stp); free_generic_stateid(stp); } -static void unhash_lockowner(struct nfs4_stateowner *sop) +static void unhash_lockowner(struct nfs4_lockowner *lo) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; - list_del(&sop->so_idhash); - list_del(&sop->so_strhash); - list_del(&sop->so_perstateid); - while (!list_empty(&sop->so_stateids)) { - stp = list_first_entry(&sop->so_stateids, - struct nfs4_stateid, st_perstateowner); + list_del(&lo->lo_owner.so_strhash); + list_del(&lo->lo_perstateid); + while (!list_empty(&lo->lo_owner.so_stateids)) { + stp = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); release_lock_stateid(stp); } } -static void release_lockowner(struct nfs4_stateowner *sop) +static void release_lockowner(struct nfs4_lockowner *lo) { - unhash_lockowner(sop); - nfs4_put_stateowner(sop); + unhash_lockowner(lo); + nfs4_free_lockowner(lo); } static void -release_stateid_lockowners(struct nfs4_stateid *open_stp) +release_stateid_lockowners(struct nfs4_ol_stateid *open_stp) { - struct nfs4_stateowner *lock_sop; + struct nfs4_lockowner *lo; while (!list_empty(&open_stp->st_lockowners)) { - lock_sop = list_entry(open_stp->st_lockowners.next, - struct nfs4_stateowner, so_perstateid); - /* list_del(&open_stp->st_lockowners); */ - BUG_ON(lock_sop->so_is_open_owner); - release_lockowner(lock_sop); + lo = list_entry(open_stp->st_lockowners.next, + struct nfs4_lockowner, lo_perstateid); + release_lockowner(lo); } } -static void release_open_stateid(struct nfs4_stateid *stp) +static void unhash_open_stateid(struct nfs4_ol_stateid *stp) { unhash_generic_stateid(stp); release_stateid_lockowners(stp); + close_generic_stateid(stp); +} + +static void release_open_stateid(struct nfs4_ol_stateid *stp) +{ + unhash_open_stateid(stp); + unhash_stid(&stp->st_stid); free_generic_stateid(stp); } -static void unhash_openowner(struct nfs4_stateowner *sop) +static void unhash_openowner(struct nfs4_openowner *oo) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; - list_del(&sop->so_idhash); - list_del(&sop->so_strhash); - list_del(&sop->so_perclient); - list_del(&sop->so_perstateid); /* XXX: necessary? */ - while (!list_empty(&sop->so_stateids)) { - stp = list_first_entry(&sop->so_stateids, - struct nfs4_stateid, st_perstateowner); + list_del(&oo->oo_owner.so_strhash); + list_del(&oo->oo_perclient); + while (!list_empty(&oo->oo_owner.so_stateids)) { + stp = list_first_entry(&oo->oo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); release_open_stateid(stp); } } -static void release_openowner(struct nfs4_stateowner *sop) +static void release_last_closed_stateid(struct nfs4_openowner *oo) +{ + struct nfs4_ol_stateid *s = oo->oo_last_closed_stid; + + if (s) { + unhash_stid(&s->st_stid); + free_generic_stateid(s); + oo->oo_last_closed_stid = NULL; + } +} + +static void release_openowner(struct nfs4_openowner *oo) { - unhash_openowner(sop); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); + unhash_openowner(oo); + list_del(&oo->oo_close_lru); + release_last_closed_stateid(oo); + nfs4_free_openowner(oo); } #define SESSION_HASH_SIZE 512 @@ -844,9 +947,6 @@ renew_client_locked(struct nfs4_client *clp) return; } - /* - * Move client to the end to the LRU list. - */ dprintk("renewing client (clientid %08x/%08x)\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); @@ -944,7 +1044,7 @@ unhash_client_locked(struct nfs4_client *clp) static void expire_client(struct nfs4_client *clp) { - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head reaplist; @@ -962,8 +1062,8 @@ expire_client(struct nfs4_client *clp) unhash_delegation(dp); } while (!list_empty(&clp->cl_openowners)) { - sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); - release_openowner(sop); + oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); + release_openowner(oo); } nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) @@ -1039,6 +1139,23 @@ static void gen_confirm(struct nfs4_client *clp) *p++ = i++; } +static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t) +{ + return idr_find(&cl->cl_stateids, t->si_opaque.so_id); +} + +static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) +{ + struct nfs4_stid *s; + + s = find_stateid(cl, t); + if (!s) + return NULL; + if (typemask & s->sc_type) + return s; + return NULL; +} + static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -1061,6 +1178,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, } } + idr_init(&clp->cl_stateids); memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); atomic_set(&clp->cl_refcount, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; @@ -1084,17 +1202,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, return clp; } -static int check_name(struct xdr_netobj name) -{ - if (name.len == 0) - return 0; - if (name.len > NFS4_OPAQUE_LIMIT) { - dprintk("NFSD: check_name: name too long(%d)!\n", name.len); - return 0; - } - return 1; -} - static void add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) { @@ -1126,8 +1233,10 @@ find_confirmed_client(clientid_t *clid) unsigned int idhashval = clientid_hashval(clid->cl_id); list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { - if (same_clid(&clp->cl_clientid, clid)) + if (same_clid(&clp->cl_clientid, clid)) { + renew_client(clp); return clp; + } } return NULL; } @@ -1174,20 +1283,6 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) return NULL; } -static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr) -{ - switch (family) { - case AF_INET: - ((struct sockaddr_in *)sa)->sin_family = AF_INET; - ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr; - return; - case AF_INET6: - ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6; - ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6; - return; - } -} - static void gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) { @@ -1219,7 +1314,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r conn->cb_prog = se->se_callback_prog; conn->cb_ident = se->se_callback_ident; - rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr); + memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen); return; out_err: conn->cb_addr.ss_family = AF_UNSPEC; @@ -1351,7 +1446,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, __func__, rqstp, exid, exid->clname.len, exid->clname.data, addr_str, exid->flags, exid->spa_how); - if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) + if (exid->flags & ~EXCHGID4_FLAG_MASK_A) return nfserr_inval; /* Currently only support SP4_NONE */ @@ -1507,6 +1602,29 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, return slot->sl_status; } +#define NFSD_MIN_REQ_HDR_SEQ_SZ ((\ + 2 * 2 + /* credential,verifier: AUTH_NULL, length 0 */ \ + 1 + /* MIN tag is length with zero, only length */ \ + 3 + /* version, opcount, opcode */ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + /* seqid, slotID, slotID, cache */ \ + 4 ) * sizeof(__be32)) + +#define NFSD_MIN_RESP_HDR_SEQ_SZ ((\ + 2 + /* verifier: AUTH_NULL, length 0 */\ + 1 + /* status */ \ + 1 + /* MIN tag is length with zero, only length */ \ + 3 + /* opcount, opcode, opstatus*/ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + /* seqid, slotID, slotID, slotID, status */ \ + 5 ) * sizeof(__be32)) + +static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs fchannel) +{ + return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ + || fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ; +} + __be32 nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -1575,6 +1693,10 @@ nfsd4_create_session(struct svc_rqst *rqstp, cr_ses->flags &= ~SESSION4_PERSIST; cr_ses->flags &= ~SESSION4_RDMA; + status = nfserr_toosmall; + if (check_forechannel_attrs(cr_ses->fore_channel)) + goto out; + status = nfserr_jukebox; new = alloc_init_session(rqstp, conf, cr_ses); if (!new) @@ -1736,6 +1858,14 @@ static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_sess return args->opcnt > session->se_fchannel.maxops; } +static bool nfsd4_request_too_big(struct svc_rqst *rqstp, + struct nfsd4_session *session) +{ + struct xdr_buf *xb = &rqstp->rq_arg; + + return xb->len > session->se_fchannel.maxreq_sz; +} + __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -1768,6 +1898,10 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (nfsd4_session_too_many_ops(rqstp, session)) goto out; + status = nfserr_req_too_big; + if (nfsd4_request_too_big(rqstp, session)) + goto out; + status = nfserr_badslot; if (seq->slotid >= session->se_fchannel.maxreqs) goto out; @@ -1811,8 +1945,16 @@ out: nfsd4_get_session(cstate->session); atomic_inc(&clp->cl_refcount); - if (clp->cl_cb_state == NFSD4_CB_DOWN) - seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; + switch (clp->cl_cb_state) { + case NFSD4_CB_DOWN: + seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; + break; + case NFSD4_CB_FAULT: + seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; + break; + default: + seq->status_flags = 0; + } } kfree(conn); spin_unlock(&client_lock); @@ -1820,6 +1962,50 @@ out: return status; } +static inline bool has_resources(struct nfs4_client *clp) +{ + return !list_empty(&clp->cl_openowners) + || !list_empty(&clp->cl_delegations) + || !list_empty(&clp->cl_sessions); +} + +__be32 +nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc) +{ + struct nfs4_client *conf, *unconf, *clp; + int status = 0; + + nfs4_lock_state(); + unconf = find_unconfirmed_client(&dc->clientid); + conf = find_confirmed_client(&dc->clientid); + + if (conf) { + clp = conf; + + if (!is_client_expired(conf) && has_resources(conf)) { + status = nfserr_clientid_busy; + goto out; + } + + /* rfc5661 18.50.3 */ + if (cstate->session && conf == cstate->session->se_client) { + status = nfserr_clientid_busy; + goto out; + } + } else if (unconf) + clp = unconf; + else { + status = nfserr_stale_clientid; + goto out; + } + + expire_client(clp); +out: + nfs4_unlock_state(); + dprintk("%s return %d\n", __func__, ntohl(status)); + return status; +} + __be32 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) { @@ -1862,19 +2048,13 @@ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) { - struct xdr_netobj clname = { - .len = setclid->se_namelen, - .data = setclid->se_name, - }; + struct xdr_netobj clname = setclid->se_name; nfs4_verifier clverifier = setclid->se_verf; unsigned int strhashval; struct nfs4_client *conf, *unconf, *new; __be32 status; char dname[HEXDIR_LEN]; - if (!check_name(clname)) - return nfserr_inval; - status = nfs4_make_rec_clidname(dname, &clname); if (status) return status; @@ -1908,7 +2088,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * of 5 bullet points, labeled as CASE0 - CASE4 below. */ unconf = find_unconfirmed_client_by_str(dname, strhashval); - status = nfserr_resource; + status = nfserr_jukebox; if (!conf) { /* * RFC 3530 14.2.33 CASE 4: @@ -2078,31 +2258,28 @@ out: return status; } +static struct nfs4_file *nfsd4_alloc_file(void) +{ + return kmem_cache_alloc(file_slab, GFP_KERNEL); +} + /* OPEN Share state helper functions */ -static inline struct nfs4_file * -alloc_init_file(struct inode *ino) +static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) { - struct nfs4_file *fp; unsigned int hashval = file_hashval(ino); - fp = kmem_cache_alloc(file_slab, GFP_KERNEL); - if (fp) { - atomic_set(&fp->fi_ref, 1); - INIT_LIST_HEAD(&fp->fi_hash); - INIT_LIST_HEAD(&fp->fi_stateids); - INIT_LIST_HEAD(&fp->fi_delegations); - fp->fi_inode = igrab(ino); - fp->fi_id = current_fileid++; - fp->fi_had_conflict = false; - fp->fi_lease = NULL; - memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); - memset(fp->fi_access, 0, sizeof(fp->fi_access)); - spin_lock(&recall_lock); - list_add(&fp->fi_hash, &file_hashtbl[hashval]); - spin_unlock(&recall_lock); - return fp; - } - return NULL; + atomic_set(&fp->fi_ref, 1); + INIT_LIST_HEAD(&fp->fi_hash); + INIT_LIST_HEAD(&fp->fi_stateids); + INIT_LIST_HEAD(&fp->fi_delegations); + fp->fi_inode = igrab(ino); + fp->fi_had_conflict = false; + fp->fi_lease = NULL; + memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); + memset(fp->fi_access, 0, sizeof(fp->fi_access)); + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); } static void @@ -2117,7 +2294,8 @@ nfsd4_free_slab(struct kmem_cache **slab) void nfsd4_free_slabs(void) { - nfsd4_free_slab(&stateowner_slab); + nfsd4_free_slab(&openowner_slab); + nfsd4_free_slab(&lockowner_slab); nfsd4_free_slab(&file_slab); nfsd4_free_slab(&stateid_slab); nfsd4_free_slab(&deleg_slab); @@ -2126,16 +2304,20 @@ nfsd4_free_slabs(void) static int nfsd4_init_slabs(void) { - stateowner_slab = kmem_cache_create("nfsd4_stateowners", - sizeof(struct nfs4_stateowner), 0, 0, NULL); - if (stateowner_slab == NULL) + openowner_slab = kmem_cache_create("nfsd4_openowners", + sizeof(struct nfs4_openowner), 0, 0, NULL); + if (openowner_slab == NULL) + goto out_nomem; + lockowner_slab = kmem_cache_create("nfsd4_lockowners", + sizeof(struct nfs4_openowner), 0, 0, NULL); + if (lockowner_slab == NULL) goto out_nomem; file_slab = kmem_cache_create("nfsd4_files", sizeof(struct nfs4_file), 0, 0, NULL); if (file_slab == NULL) goto out_nomem; stateid_slab = kmem_cache_create("nfsd4_stateids", - sizeof(struct nfs4_stateid), 0, 0, NULL); + sizeof(struct nfs4_ol_stateid), 0, 0, NULL); if (stateid_slab == NULL) goto out_nomem; deleg_slab = kmem_cache_create("nfsd4_delegations", @@ -2149,97 +2331,94 @@ out_nomem: return -ENOMEM; } -void -nfs4_free_stateowner(struct kref *kref) +void nfs4_free_openowner(struct nfs4_openowner *oo) { - struct nfs4_stateowner *sop = - container_of(kref, struct nfs4_stateowner, so_ref); - kfree(sop->so_owner.data); - kmem_cache_free(stateowner_slab, sop); + kfree(oo->oo_owner.so_owner.data); + kmem_cache_free(openowner_slab, oo); } -static inline struct nfs4_stateowner * -alloc_stateowner(struct xdr_netobj *owner) +void nfs4_free_lockowner(struct nfs4_lockowner *lo) { - struct nfs4_stateowner *sop; + kfree(lo->lo_owner.so_owner.data); + kmem_cache_free(lockowner_slab, lo); +} - if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) { - if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) { - memcpy(sop->so_owner.data, owner->data, owner->len); - sop->so_owner.len = owner->len; - kref_init(&sop->so_ref); - return sop; - } - kmem_cache_free(stateowner_slab, sop); - } - return NULL; +static void init_nfs4_replay(struct nfs4_replay *rp) +{ + rp->rp_status = nfserr_serverfault; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; } -static struct nfs4_stateowner * -alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { +static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp) +{ struct nfs4_stateowner *sop; - struct nfs4_replay *rp; - unsigned int idhashval; - if (!(sop = alloc_stateowner(&open->op_owner))) + sop = kmem_cache_alloc(slab, GFP_KERNEL); + if (!sop) + return NULL; + + sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL); + if (!sop->so_owner.data) { + kmem_cache_free(slab, sop); return NULL; - idhashval = ownerid_hashval(current_ownerid); - INIT_LIST_HEAD(&sop->so_idhash); - INIT_LIST_HEAD(&sop->so_strhash); - INIT_LIST_HEAD(&sop->so_perclient); + } + sop->so_owner.len = owner->len; + INIT_LIST_HEAD(&sop->so_stateids); - INIT_LIST_HEAD(&sop->so_perstateid); /* not used */ - INIT_LIST_HEAD(&sop->so_close_lru); - sop->so_time = 0; - list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perclient, &clp->cl_openowners); - sop->so_is_open_owner = 1; - sop->so_id = current_ownerid++; sop->so_client = clp; - sop->so_seqid = open->op_seqid; - sop->so_confirmed = 0; - rp = &sop->so_replay; - rp->rp_status = nfserr_serverfault; - rp->rp_buflen = 0; - rp->rp_buf = rp->rp_ibuf; + init_nfs4_replay(&sop->so_replay); return sop; } -static inline void -init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { - struct nfs4_stateowner *sop = open->op_stateowner; - unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); +static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) +{ + list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); + list_add(&oo->oo_perclient, &clp->cl_openowners); +} - INIT_LIST_HEAD(&stp->st_hash); - INIT_LIST_HEAD(&stp->st_perstateowner); +static struct nfs4_openowner * +alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { + struct nfs4_openowner *oo; + + oo = alloc_stateowner(openowner_slab, &open->op_owner, clp); + if (!oo) + return NULL; + oo->oo_owner.so_is_open_owner = 1; + oo->oo_owner.so_seqid = open->op_seqid; + oo->oo_flags = NFS4_OO_NEW; + oo->oo_time = 0; + oo->oo_last_closed_stid = NULL; + INIT_LIST_HEAD(&oo->oo_close_lru); + hash_openowner(oo, clp, strhashval); + return oo; +} + +static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { + struct nfs4_openowner *oo = open->op_openowner; + struct nfs4_client *clp = oo->oo_owner.so_client; + + init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); INIT_LIST_HEAD(&stp->st_lockowners); - INIT_LIST_HEAD(&stp->st_perfile); - list_add(&stp->st_hash, &stateid_hashtbl[hashval]); - list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); - stp->st_stateowner = sop; + stp->st_stateowner = &oo->oo_owner; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = sop->so_id; - stp->st_stateid.si_fileid = fp->fi_id; - stp->st_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; - __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, - &stp->st_access_bmap); + __set_bit(open->op_share_access, &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); stp->st_openstp = NULL; } static void -move_to_close_lru(struct nfs4_stateowner *sop) +move_to_close_lru(struct nfs4_openowner *oo) { - dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); + dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); - list_move_tail(&sop->so_close_lru, &close_lru); - sop->so_time = get_seconds(); + list_move_tail(&oo->oo_close_lru, &close_lru); + oo->oo_time = get_seconds(); } static int @@ -2251,14 +2430,18 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, (sop->so_client->cl_clientid.cl_id == clid->cl_id); } -static struct nfs4_stateowner * +static struct nfs4_openowner * find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) { - struct nfs4_stateowner *so = NULL; + struct nfs4_stateowner *so; + struct nfs4_openowner *oo; - list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { - if (same_owner_str(so, &open->op_owner, &open->op_clientid)) - return so; + list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { + oo = openowner(so); + renew_client(oo->oo_owner.so_client); + return oo; + } } return NULL; } @@ -2282,31 +2465,6 @@ find_file(struct inode *ino) return NULL; } -static inline int access_valid(u32 x, u32 minorversion) -{ - if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) - return 0; - if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH) - return 0; - x &= ~NFS4_SHARE_ACCESS_MASK; - if (minorversion && x) { - if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL) - return 0; - if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED) - return 0; - x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK); - } - if (x) - return 0; - return 1; -} - -static inline int deny_valid(u32 x) -{ - /* Note: unlike access bits, deny bits may be zero. */ - return x <= NFS4_SHARE_DENY_BOTH; -} - /* * Called to check deny when READ with all zero stateid or * WRITE with all zero or all one stateid @@ -2316,7 +2474,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) { struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_file *fp; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; __be32 ret; dprintk("NFSD: nfs4_share_conflict\n"); @@ -2337,15 +2495,6 @@ out: return ret; } -static inline void -nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access) -{ - if (share_access & NFS4_SHARE_ACCESS_WRITE) - nfs4_file_put_access(fp, O_WRONLY); - if (share_access & NFS4_SHARE_ACCESS_READ) - nfs4_file_put_access(fp, O_RDONLY); -} - static void nfsd_break_one_deleg(struct nfs4_delegation *dp) { /* We're assuming the state code never drops its reference @@ -2396,10 +2545,20 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) } static const struct lock_manager_operations nfsd_lease_mng_ops = { - .fl_break = nfsd_break_deleg_cb, - .fl_change = nfsd_change_deleg_cb, + .lm_break = nfsd_break_deleg_cb, + .lm_change = nfsd_change_deleg_cb, }; +static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid) +{ + if (nfsd4_has_session(cstate)) + return nfs_ok; + if (seqid == so->so_seqid - 1) + return nfserr_replay_me; + if (seqid == so->so_seqid) + return nfs_ok; + return nfserr_bad_seqid; +} __be32 nfsd4_process_open1(struct nfsd4_compound_state *cstate, @@ -2408,57 +2567,49 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, clientid_t *clientid = &open->op_clientid; struct nfs4_client *clp = NULL; unsigned int strhashval; - struct nfs4_stateowner *sop = NULL; - - if (!check_name(open->op_owner)) - return nfserr_inval; + struct nfs4_openowner *oo = NULL; + __be32 status; if (STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; + /* + * In case we need it later, after we've already created the + * file and don't want to risk a further failure: + */ + open->op_file = nfsd4_alloc_file(); + if (open->op_file == NULL) + return nfserr_jukebox; - strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner); - sop = find_openstateowner_str(strhashval, open); - open->op_stateowner = sop; - if (!sop) { - /* Make sure the client's lease hasn't expired. */ + strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); + oo = find_openstateowner_str(strhashval, open); + open->op_openowner = oo; + if (!oo) { clp = find_confirmed_client(clientid); if (clp == NULL) return nfserr_expired; - goto renew; + goto new_owner; } - /* When sessions are used, skip open sequenceid processing */ - if (nfsd4_has_session(cstate)) - goto renew; - if (!sop->so_confirmed) { + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { /* Replace unconfirmed owners without checking for replay. */ - clp = sop->so_client; - release_openowner(sop); - open->op_stateowner = NULL; - goto renew; + clp = oo->oo_owner.so_client; + release_openowner(oo); + open->op_openowner = NULL; + goto new_owner; } - if (open->op_seqid == sop->so_seqid - 1) { - if (sop->so_replay.rp_buflen) - return nfserr_replay_me; - /* The original OPEN failed so spectacularly - * that we don't even have replay data saved! - * Therefore, we have no choice but to continue - * processing this OPEN; presumably, we'll - * fail again for the same reason. - */ - dprintk("nfsd4_process_open1: replay with no replay cache\n"); - goto renew; - } - if (open->op_seqid != sop->so_seqid) - return nfserr_bad_seqid; -renew: - if (open->op_stateowner == NULL) { - sop = alloc_init_open_stateowner(strhashval, clp, open); - if (sop == NULL) - return nfserr_resource; - open->op_stateowner = sop; - } - list_del_init(&sop->so_close_lru); - renew_client(sop->so_client); + status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); + if (status) + return status; + clp = oo->oo_owner.so_client; + goto alloc_stateid; +new_owner: + oo = alloc_init_open_stateowner(strhashval, clp, open); + if (oo == NULL) + return nfserr_jukebox; + open->op_openowner = oo; +alloc_stateid: + open->op_stp = nfs4_alloc_stateid(clp); + if (!open->op_stp) + return nfserr_jukebox; return nfs_ok; } @@ -2471,36 +2622,37 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) return nfs_ok; } -static struct nfs4_delegation * -find_delegation_file(struct nfs4_file *fp, stateid_t *stid) +static int share_access_to_flags(u32 share_access) { - struct nfs4_delegation *dp; + share_access &= ~NFS4_SHARE_WANT_MASK; - spin_lock(&recall_lock); - list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) - if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) { - spin_unlock(&recall_lock); - return dp; - } - spin_unlock(&recall_lock); - return NULL; + return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } -static int share_access_to_flags(u32 share_access) +static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) { - share_access &= ~NFS4_SHARE_WANT_MASK; + struct nfs4_stid *ret; - return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; + ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID); + if (!ret) + return NULL; + return delegstateid(ret); +} + +static bool nfsd4_is_deleg_cur(struct nfsd4_open *open) +{ + return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR || + open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH; } static __be32 -nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, +nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_delegation **dp) { int flags; __be32 status = nfserr_bad_stateid; - *dp = find_delegation_file(fp, &open->op_delegate_stateid); + *dp = find_deleg_stateid(cl, &open->op_delegate_stateid); if (*dp == NULL) goto out; flags = share_access_to_flags(open->op_share_access); @@ -2508,41 +2660,37 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, if (status) *dp = NULL; out: - if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR) + if (!nfsd4_is_deleg_cur(open)) return nfs_ok; if (status) return status; - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; return nfs_ok; } static __be32 -nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) +nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp) { - struct nfs4_stateid *local; - __be32 status = nfserr_share_denied; - struct nfs4_stateowner *sop = open->op_stateowner; + struct nfs4_ol_stateid *local; + struct nfs4_openowner *oo = open->op_openowner; list_for_each_entry(local, &fp->fi_stateids, st_perfile) { /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; /* remember if we have seen this open owner */ - if (local->st_stateowner == sop) + if (local->st_stateowner == &oo->oo_owner) *stpp = local; /* check for conflicting share reservations */ if (!test_share(local, open)) - goto out; + return nfserr_share_denied; } - status = 0; -out: - return status; + return nfs_ok; } -static inline struct nfs4_stateid * -nfs4_alloc_stateid(void) +static void nfs4_free_stateid(struct nfs4_ol_stateid *s) { - return kmem_cache_alloc(stateid_slab, GFP_KERNEL); + kmem_cache_free(stateid_slab, s); } static inline int nfs4_access_to_access(u32 nfs4_access) @@ -2556,12 +2704,12 @@ static inline int nfs4_access_to_access(u32 nfs4_access) return flags; } -static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file -*fp, struct svc_fh *cur_fh, u32 nfs4_access) +static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, + struct svc_fh *cur_fh, struct nfsd4_open *open) { __be32 status; - int oflag = nfs4_access_to_omode(nfs4_access); - int access = nfs4_access_to_access(nfs4_access); + int oflag = nfs4_access_to_omode(open->op_share_access); + int access = nfs4_access_to_access(open->op_share_access); if (!fp->fi_fds[oflag]) { status = nfsd_open(rqstp, cur_fh, S_IFREG, access, @@ -2574,27 +2722,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file return nfs_ok; } -static __be32 -nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, - struct nfs4_file *fp, struct svc_fh *cur_fh, - struct nfsd4_open *open) -{ - struct nfs4_stateid *stp; - __be32 status; - - stp = nfs4_alloc_stateid(); - if (stp == NULL) - return nfserr_resource; - - status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open->op_share_access); - if (status) { - kmem_cache_free(stateid_slab, stp); - return status; - } - *stpp = stp; - return 0; -} - static inline __be32 nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, struct nfsd4_open *open) @@ -2611,22 +2738,22 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, } static __be32 -nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { - u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; + u32 op_share_access = open->op_share_access; bool new_access; __be32 status; new_access = !test_bit(op_share_access, &stp->st_access_bmap); if (new_access) { - status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access); + status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); if (status) return status; } status = nfsd4_truncate(rqstp, cur_fh, open); if (status) { if (new_access) { - int oflag = nfs4_access_to_omode(new_access); + int oflag = nfs4_access_to_omode(op_share_access); nfs4_file_put_access(fp, oflag); } return status; @@ -2642,8 +2769,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c static void nfs4_set_claim_prev(struct nfsd4_open *open) { - open->op_stateowner->so_confirmed = 1; - open->op_stateowner->so_client->cl_firststate = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + open->op_openowner->oo_owner.so_client->cl_firststate = 1; } /* Should we give out recallable state?: */ @@ -2686,7 +2813,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag) if (!fl) return -ENOMEM; fl->fl_file = find_readable_file(fp); - list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); if (status) { list_del_init(&dp->dl_perclnt); @@ -2715,7 +2842,7 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) atomic_inc(&fp->fi_delegees); list_add(&dp->dl_perfile, &fp->fi_delegations); spin_unlock(&recall_lock); - list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); return 0; } @@ -2723,14 +2850,14 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) * Attempt to hand out a delegation. */ static void -nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp) +nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_stateid *stp) { struct nfs4_delegation *dp; - struct nfs4_stateowner *sop = stp->st_stateowner; + struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); int cb_up; int status, flag = 0; - cb_up = nfsd4_cb_channel_good(sop->so_client); + cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); flag = NFS4_OPEN_DELEGATE_NONE; open->op_recall = 0; switch (open->op_claim_type) { @@ -2746,7 +2873,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta * had the chance to reclaim theirs.... */ if (locks_in_grace()) goto out; - if (!cb_up || !sop->so_confirmed) + if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) flag = NFS4_OPEN_DELEGATE_WRITE; @@ -2757,17 +2884,17 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta goto out; } - dp = alloc_init_deleg(sop->so_client, stp, fh, flag); + dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag); if (dp == NULL) goto out_no_deleg; status = nfs4_set_delegation(dp, flag); if (status) goto out_free; - memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); + memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", - STATEID_VAL(&dp->dl_stateid)); + STATEID_VAL(&dp->dl_stid.sc_stateid)); out: if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && flag == NFS4_OPEN_DELEGATE_NONE @@ -2789,16 +2916,13 @@ __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; struct nfs4_file *fp = NULL; struct inode *ino = current_fh->fh_dentry->d_inode; - struct nfs4_stateid *stp = NULL; + struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; - status = nfserr_inval; - if (!access_valid(open->op_share_access, resp->cstate.minorversion) - || !deny_valid(open->op_share_deny)) - goto out; /* * Lookup file; if found, lookup stateid and check open request, * and check for delegations in the process of being recalled. @@ -2808,17 +2932,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (fp) { if ((status = nfs4_check_open(fp, open, &stp))) goto out; - status = nfs4_check_deleg(fp, open, &dp); + status = nfs4_check_deleg(cl, fp, open, &dp); if (status) goto out; } else { status = nfserr_bad_stateid; - if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) - goto out; - status = nfserr_resource; - fp = alloc_init_file(ino); - if (fp == NULL) + if (nfsd4_is_deleg_cur(open)) goto out; + status = nfserr_jukebox; + fp = open->op_file; + open->op_file = NULL; + nfsd4_init_file(fp, ino); } /* @@ -2830,24 +2954,24 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) goto out; - update_stateid(&stp->st_stateid); } else { - status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, open); if (status) goto out; - init_stateid(stp, fp, open); + stp = open->op_stp; + open->op_stp = NULL; + init_open_stateid(stp, fp, open); status = nfsd4_truncate(rqstp, current_fh, open); if (status) { release_open_stateid(stp); goto out; } - if (nfsd4_has_session(&resp->cstate)) - update_stateid(&stp->st_stateid); } - memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); if (nfsd4_has_session(&resp->cstate)) - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; /* * Attempt to hand out a delegation. No error return, because the @@ -2858,7 +2982,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs_ok; dprintk("%s: stateid=" STATEID_FMT "\n", __func__, - STATEID_VAL(&stp->st_stateid)); + STATEID_VAL(&stp->st_stid.sc_stateid)); out: if (fp) put_nfs4_file(fp); @@ -2868,13 +2992,34 @@ out: * To finish the open response, we just need to set the rflags. */ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!open->op_stateowner->so_confirmed && + if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && !nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; return status; } +void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) +{ + if (open->op_openowner) { + struct nfs4_openowner *oo = open->op_openowner; + + if (!list_empty(&oo->oo_owner.so_stateids)) + list_del_init(&oo->oo_close_lru); + if (oo->oo_flags & NFS4_OO_NEW) { + if (status) { + release_openowner(oo); + open->op_openowner = NULL; + } else + oo->oo_flags &= ~NFS4_OO_NEW; + } + } + if (open->op_file) + nfsd4_free_file(open->op_file); + if (open->op_stp) + nfs4_free_stateid(open->op_stp); +} + __be32 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, clientid_t *clid) @@ -2895,7 +3040,6 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("nfsd4_renew: clientid not found!\n"); goto out; } - renew_client(clp); status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != NFSD4_CB_UP) @@ -2927,7 +3071,7 @@ static time_t nfs4_laundromat(void) { struct nfs4_client *clp; - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nfsd4_lease; @@ -2984,16 +3128,14 @@ nfs4_laundromat(void) } test_val = nfsd4_lease; list_for_each_safe(pos, next, &close_lru) { - sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); - if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { - u = sop->so_time - cutoff; + oo = container_of(pos, struct nfs4_openowner, oo_close_lru); + if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { + u = oo->oo_time - cutoff; if (test_val > u) test_val = u; break; } - dprintk("NFSD: purging unused open stateowner (so_id %d)\n", - sop->so_id); - release_openowner(sop); + release_openowner(oo); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -3015,30 +3157,17 @@ laundromat_main(struct work_struct *not_used) queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); } -static struct nfs4_stateowner * -search_close_lru(u32 st_id, int flags) +static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) { - struct nfs4_stateowner *local = NULL; - - if (flags & CLOSE_STATE) { - list_for_each_entry(local, &close_lru, so_close_lru) { - if (local->so_id == st_id) - return local; - } - } - return NULL; -} - -static inline int -nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) -{ - return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; + if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode) + return nfserr_bad_stateid; + return nfs_ok; } static int STALE_STATEID(stateid_t *stateid) { - if (stateid->si_boot == boot_time) + if (stateid->si_opaque.so_clid.cl_boot == boot_time) return 0; dprintk("NFSD: stale stateid " STATEID_FMT "!\n", STATEID_VAL(stateid)); @@ -3061,7 +3190,7 @@ access_permit_write(unsigned long access_bmap) } static -__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags) +__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags) { __be32 status = nfserr_openmode; @@ -3104,37 +3233,80 @@ grace_disallows_io(struct inode *inode) return locks_in_grace() && mandatory_lock(inode); } -static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags) +/* Returns true iff a is later than b: */ +static bool stateid_generation_after(stateid_t *a, stateid_t *b) +{ + return (s32)a->si_generation - (s32)b->si_generation > 0; +} + +static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* * When sessions are used the stateid generation number is ignored * when it is zero. */ - if ((flags & HAS_SESSION) && in->si_generation == 0) - goto out; + if (has_session && in->si_generation == 0) + return nfs_ok; + + if (in->si_generation == ref->si_generation) + return nfs_ok; /* If the client sends us a stateid from the future, it's buggy: */ - if (in->si_generation > ref->si_generation) + if (stateid_generation_after(in, ref)) return nfserr_bad_stateid; /* - * The following, however, can happen. For example, if the - * client sends an open and some IO at the same time, the open - * may bump si_generation while the IO is still in flight. - * Thanks to hard links and renames, the client never knows what - * file an open will affect. So it could avoid that situation - * only by serializing all opens and IO from the same open - * owner. To recover from the old_stateid error, the client - * will just have to retry the IO: + * However, we could see a stateid from the past, even from a + * non-buggy client. For example, if the client sends a lock + * while some IO is outstanding, the lock may bump si_generation + * while the IO is still in flight. The client could avoid that + * situation by waiting for responses on all the IO requests, + * but better performance may result in retrying IO that + * receives an old_stateid error if requests are rarely + * reordered in flight: */ - if (in->si_generation < ref->si_generation) - return nfserr_old_stateid; -out: + return nfserr_old_stateid; +} + +__be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) +{ + struct nfs4_stid *s; + struct nfs4_ol_stateid *ols; + __be32 status; + + if (STALE_STATEID(stateid)) + return nfserr_stale_stateid; + + s = find_stateid(cl, stateid); + if (!s) + return nfserr_stale_stateid; + status = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (status) + return status; + if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID))) + return nfs_ok; + ols = openlockstateid(s); + if (ols->st_stateowner->so_is_open_owner + && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; return nfs_ok; } -static int is_delegation_stateid(stateid_t *stateid) +static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s) { - return stateid->si_fileid == 0; + struct nfs4_client *cl; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return nfserr_bad_stateid; + if (STALE_STATEID(stateid)) + return nfserr_stale_stateid; + cl = find_confirmed_client(&stateid->si_opaque.so_clid); + if (!cl) + return nfserr_expired; + *s = find_stateid_by_type(cl, stateid, typemask); + if (!*s) + return nfserr_bad_stateid; + return nfs_ok; + } /* @@ -3144,7 +3316,8 @@ __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filpp) { - struct nfs4_stateid *stp = NULL; + struct nfs4_stid *s; + struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; struct svc_fh *current_fh = &cstate->current_fh; struct inode *ino = current_fh->fh_dentry->d_inode; @@ -3156,66 +3329,108 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (grace_disallows_io(ino)) return nfserr_grace; - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(current_fh, stateid, flags); - status = nfserr_stale_stateid; - if (STALE_STATEID(stateid)) + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s); + if (status) + return status; + status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); + if (status) goto out; - - /* - * We assume that any stateid that has the current boot time, - * but that we can't find, is expired: - */ - status = nfserr_expired; - if (is_delegation_stateid(stateid)) { - dp = find_delegation_stateid(ino, stateid); - if (!dp) - goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, - flags); - if (status) - goto out; + switch (s->sc_type) { + case NFS4_DELEG_STID: + dp = delegstateid(s); status = nfs4_check_delegmode(dp, flags); if (status) goto out; - renew_client(dp->dl_client); if (filpp) { *filpp = dp->dl_file->fi_deleg_file; BUG_ON(!*filpp); } - } else { /* open or lock stateid */ - stp = find_stateid(stateid, flags); - if (!stp) - goto out; - status = nfserr_bad_stateid; - if (nfs4_check_fh(current_fh, stp)) - goto out; - if (!stp->st_stateowner->so_confirmed) - goto out; - status = check_stateid_generation(stateid, &stp->st_stateid, - flags); + break; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + stp = openlockstateid(s); + status = nfs4_check_fh(current_fh, stp); if (status) goto out; + if (stp->st_stateowner->so_is_open_owner + && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + goto out; status = nfs4_check_openmode(stp, flags); if (status) goto out; - renew_client(stp->st_stateowner->so_client); if (filpp) { if (flags & RD_STATE) *filpp = find_readable_file(stp->st_file); else *filpp = find_writeable_file(stp->st_file); } + break; + default: + return nfserr_bad_stateid; } status = nfs_ok; out: return status; } +static __be32 +nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) +{ + if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner))) + return nfserr_locks_held; + release_lock_stateid(stp); + return nfs_ok; +} + +/* + * Test if the stateid is valid + */ +__be32 +nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_test_stateid *test_stateid) +{ + /* real work is done during encoding */ + return nfs_ok; +} + +__be32 +nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_free_stateid *free_stateid) +{ + stateid_t *stateid = &free_stateid->fr_stateid; + struct nfs4_stid *s; + struct nfs4_client *cl = cstate->session->se_client; + __be32 ret = nfserr_bad_stateid; + + nfs4_lock_state(); + s = find_stateid(cl, stateid); + if (!s) + goto out; + switch (s->sc_type) { + case NFS4_DELEG_STID: + ret = nfserr_locks_held; + goto out; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + goto out; + if (s->sc_type == NFS4_LOCK_STID) + ret = nfsd4_free_lock_stateid(openlockstateid(s)); + else + ret = nfserr_locks_held; + break; + default: + ret = nfserr_bad_stateid; + } +out: + nfs4_unlock_state(); + return ret; +} + static inline int setlkflg (int type) { @@ -3223,124 +3438,64 @@ setlkflg (int type) RD_STATE : WR_STATE; } +static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + struct nfs4_stateowner *sop = stp->st_stateowner; + __be32 status; + + status = nfsd4_check_seqid(cstate, sop, seqid); + if (status) + return status; + if (stp->st_stid.sc_type == NFS4_CLOSED_STID) + /* + * "Closed" stateid's exist *only* to return + * nfserr_replay_me from the previous step. + */ + return nfserr_bad_stateid; + status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); + if (status) + return status; + return nfs4_check_fh(current_fh, stp); +} + /* * Checks for sequence id mutating operations. */ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, - stateid_t *stateid, int flags, - struct nfs4_stateowner **sopp, - struct nfs4_stateid **stpp, struct nfsd4_lock *lock) + stateid_t *stateid, char typemask, + struct nfs4_ol_stateid **stpp) { - struct nfs4_stateid *stp; - struct nfs4_stateowner *sop; - struct svc_fh *current_fh = &cstate->current_fh; __be32 status; + struct nfs4_stid *s; dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, seqid, STATEID_VAL(stateid)); *stpp = NULL; - *sopp = NULL; - - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { - dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); - return nfserr_bad_stateid; - } - - if (STALE_STATEID(stateid)) - return nfserr_stale_stateid; - - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; - - /* - * We return BAD_STATEID if filehandle doesn't match stateid, - * the confirmed flag is incorrecly set, or the generation - * number is incorrect. - */ - stp = find_stateid(stateid, flags); - if (stp == NULL) { - /* - * Also, we should make sure this isn't just the result of - * a replayed close: - */ - sop = search_close_lru(stateid->si_stateownerid, flags); - /* It's not stale; let's assume it's expired: */ - if (sop == NULL) - return nfserr_expired; - *sopp = sop; - goto check_replay; - } - - *stpp = stp; - *sopp = sop = stp->st_stateowner; - - if (lock) { - clientid_t *lockclid = &lock->v.new.clientid; - struct nfs4_client *clp = sop->so_client; - int lkflg = 0; - __be32 status; - - lkflg = setlkflg(lock->lk_type); - - if (lock->lk_is_new) { - if (!sop->so_is_open_owner) - return nfserr_bad_stateid; - if (!(flags & HAS_SESSION) && - !same_clid(&clp->cl_clientid, lockclid)) - return nfserr_bad_stateid; - /* stp is the open stateid */ - status = nfs4_check_openmode(stp, lkflg); - if (status) - return status; - } else { - /* stp is the lock stateid */ - status = nfs4_check_openmode(stp->st_openstp, lkflg); - if (status) - return status; - } - } + status = nfsd4_lookup_stateid(stateid, typemask, &s); + if (status) + return status; + *stpp = openlockstateid(s); + cstate->replay_owner = (*stpp)->st_stateowner; - if (nfs4_check_fh(current_fh, stp)) { - dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); - return nfserr_bad_stateid; - } + return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); +} - /* - * We now validate the seqid and stateid generation numbers. - * For the moment, we ignore the possibility of - * generation number wraparound. - */ - if (!(flags & HAS_SESSION) && seqid != sop->so_seqid) - goto check_replay; +static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp) +{ + __be32 status; + struct nfs4_openowner *oo; - if (sop->so_confirmed && flags & CONFIRM) { - dprintk("NFSD: preprocess_seqid_op: expected" - " unconfirmed stateowner!\n"); - return nfserr_bad_stateid; - } - if (!sop->so_confirmed && !(flags & CONFIRM)) { - dprintk("NFSD: preprocess_seqid_op: stateowner not" - " confirmed yet!\n"); - return nfserr_bad_stateid; - } - status = check_stateid_generation(stateid, &stp->st_stateid, flags); + status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, + NFS4_OPEN_STID, stpp); if (status) return status; - renew_client(sop->so_client); + oo = openowner((*stpp)->st_stateowner); + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; return nfs_ok; - -check_replay: - if (seqid == sop->so_seqid - 1) { - dprintk("NFSD: preprocess_seqid_op: retransmission?\n"); - /* indicate replay to calling function */ - return nfserr_replay_me; - } - dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", - sop->so_seqid, seqid); - *sopp = NULL; - return nfserr_bad_seqid; } __be32 @@ -3348,8 +3503,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open_confirm *oc) { __be32 status; - struct nfs4_stateowner *sop; - struct nfs4_stateid *stp; + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3361,41 +3516,52 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, + status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, - CONFIRM | OPEN_STATE, - &oc->oc_stateowner, &stp, NULL))) - goto out; - - sop = oc->oc_stateowner; - sop->so_confirmed = 1; - update_stateid(&stp->st_stateid); - memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); + NFS4_OPEN_STID, &stp); + if (status) + goto out; + oo = openowner(stp->st_stateowner); + status = nfserr_bad_stateid; + if (oo->oo_flags & NFS4_OO_CONFIRMED) + goto out; + oo->oo_flags |= NFS4_OO_CONFIRMED; + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", - __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); + __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); - nfsd4_create_clid_dir(sop->so_client); + nfsd4_create_clid_dir(oo->oo_owner.so_client); + status = nfs_ok; out: - if (oc->oc_stateowner) { - nfs4_get_stateowner(oc->oc_stateowner); - cstate->replay_owner = oc->oc_stateowner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } +static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access) +{ + if (!test_bit(access, &stp->st_access_bmap)) + return; + nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access)); + __clear_bit(access, &stp->st_access_bmap); +} -/* - * unset all bits in union bitmap (bmap) that - * do not exist in share (from successful OPEN_DOWNGRADE) - */ -static void -reset_union_bmap_access(unsigned long access, unsigned long *bmap) +static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access) { - int i; - for (i = 1; i < 4; i++) { - if ((i & access) != i) - __clear_bit(i, bmap); + switch (to_access) { + case NFS4_SHARE_ACCESS_READ: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_WRITE: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + BUG(); } } @@ -3415,25 +3581,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, struct nfsd4_open_downgrade *od) { __be32 status; - struct nfs4_stateid *stp; - unsigned int share_access; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, cstate->current_fh.fh_dentry->d_name.name); - if (!access_valid(od->od_share_access, cstate->minorversion) - || !deny_valid(od->od_share_deny)) - return nfserr_inval; + /* We don't yet support WANT bits: */ + od->od_share_access &= NFS4_SHARE_ACCESS_MASK; nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, - od->od_seqid, - &od->od_stateid, - OPEN_STATE, - &od->od_stateowner, &stp, NULL))) + status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, + &od->od_stateid, &stp); + if (status) goto out; - status = nfserr_inval; if (!test_bit(od->od_share_access, &stp->st_access_bmap)) { dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n", @@ -3445,24 +3606,45 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, stp->st_deny_bmap, od->od_share_deny); goto out; } - set_access(&share_access, stp->st_access_bmap); - nfs4_file_downgrade(stp->st_file, share_access & ~od->od_share_access); + nfs4_stateid_downgrade(stp, od->od_share_access); - reset_union_bmap_access(od->od_share_access, &stp->st_access_bmap); reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); - update_stateid(&stp->st_stateid); - memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); status = nfs_ok; out: - if (od->od_stateowner) { - nfs4_get_stateowner(od->od_stateowner); - cstate->replay_owner = od->od_stateowner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } +void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so) +{ + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *s; + + if (!so->so_is_open_owner) + return; + oo = openowner(so); + s = oo->oo_last_closed_stid; + if (!s) + return; + if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) { + /* Release the last_closed_stid on the next seqid bump: */ + oo->oo_flags |= NFS4_OO_PURGE_CLOSE; + return; + } + oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE; + release_last_closed_stateid(oo); +} + +static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) +{ + unhash_open_stateid(s); + s->st_stid.sc_type = NFS4_CLOSED_STID; +} + /* * nfs4_unlock_state() called after encode */ @@ -3471,39 +3653,37 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_close *close) { __be32 status; - struct nfs4_stateid *stp; + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_close on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, cstate->current_fh.fh_dentry->d_name.name); nfs4_lock_state(); - /* check close_lru for replay */ - if ((status = nfs4_preprocess_seqid_op(cstate, - close->cl_seqid, - &close->cl_stateid, - OPEN_STATE | CLOSE_STATE, - &close->cl_stateowner, &stp, NULL))) + status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, + &close->cl_stateid, + NFS4_OPEN_STID|NFS4_CLOSED_STID, + &stp); + if (status) goto out; + oo = openowner(stp->st_stateowner); status = nfs_ok; - update_stateid(&stp->st_stateid); - memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - /* release_stateid() calls nfsd_close() if needed */ - release_open_stateid(stp); + nfsd4_close_open_stateid(stp); + oo->oo_last_closed_stid = stp; /* place unused nfs4_stateowners on so_close_lru list to be * released by the laundromat service after the lease period * to enable us to handle CLOSE replay */ - if (list_empty(&close->cl_stateowner->so_stateids)) - move_to_close_lru(close->cl_stateowner); + if (list_empty(&oo->oo_owner.so_stateids)) + move_to_close_lru(oo); out: - if (close->cl_stateowner) { - nfs4_get_stateowner(close->cl_stateowner); - cstate->replay_owner = close->cl_stateowner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } @@ -3513,34 +3693,22 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfs4_delegation *dp; stateid_t *stateid = &dr->dr_stateid; + struct nfs4_stid *s; struct inode *inode; __be32 status; - int flags = 0; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; inode = cstate->current_fh.fh_dentry->d_inode; - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; nfs4_lock_state(); - status = nfserr_bad_stateid; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - goto out; - status = nfserr_stale_stateid; - if (STALE_STATEID(stateid)) - goto out; - status = nfserr_bad_stateid; - if (!is_delegation_stateid(stateid)) - goto out; - status = nfserr_expired; - dp = find_delegation_stateid(inode, stateid); - if (!dp) + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s); + if (status) goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, flags); + dp = delegstateid(s); + status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; - renew_client(dp->dl_client); unhash_delegation(dp); out: @@ -3578,9 +3746,6 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1: NFS4_MAX_UINT64; } -#define lockownerid_hashval(id) \ - ((id) & LOCK_HASH_MASK) - static inline unsigned int lock_ownerstr_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername) @@ -3590,55 +3755,7 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id, & LOCK_HASH_MASK; } -static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; -static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; - -static struct nfs4_stateid * -find_stateid(stateid_t *stid, int flags) -{ - struct nfs4_stateid *local; - u32 st_id = stid->si_stateownerid; - u32 f_id = stid->si_fileid; - unsigned int hashval; - - dprintk("NFSD: find_stateid flags 0x%x\n",flags); - if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) { - hashval = stateid_hashval(st_id, f_id); - list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { - if ((local->st_stateid.si_stateownerid == st_id) && - (local->st_stateid.si_fileid == f_id)) - return local; - } - } - - if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) { - hashval = stateid_hashval(st_id, f_id); - list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { - if ((local->st_stateid.si_stateownerid == st_id) && - (local->st_stateid.si_fileid == f_id)) - return local; - } - } - return NULL; -} - -static struct nfs4_delegation * -find_delegation_stateid(struct inode *ino, stateid_t *stid) -{ - struct nfs4_file *fp; - struct nfs4_delegation *dl; - - dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__, - STATEID_VAL(stid)); - - fp = find_file(ino); - if (!fp) - return NULL; - dl = find_delegation_file(fp, stid); - put_nfs4_file(fp); - return dl; -} /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that @@ -3665,15 +3782,21 @@ static const struct lock_manager_operations nfsd_posix_mng_ops = { static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { - struct nfs4_stateowner *sop; + struct nfs4_lockowner *lo; if (fl->fl_lmops == &nfsd_posix_mng_ops) { - sop = (struct nfs4_stateowner *) fl->fl_owner; - kref_get(&sop->so_ref); - deny->ld_sop = sop; - deny->ld_clientid = sop->so_client->cl_clientid; + lo = (struct nfs4_lockowner *) fl->fl_owner; + deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data, + lo->lo_owner.so_owner.len, GFP_KERNEL); + if (!deny->ld_owner.data) + /* We just don't care that much */ + goto nevermind; + deny->ld_owner.len = lo->lo_owner.so_owner.len; + deny->ld_clientid = lo->lo_owner.so_client->cl_clientid; } else { - deny->ld_sop = NULL; +nevermind: + deny->ld_owner.len = 0; + deny->ld_owner.data = NULL; deny->ld_clientid.cl_boot = 0; deny->ld_clientid.cl_id = 0; } @@ -3686,8 +3809,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) deny->ld_type = NFS4_WRITE_LT; } -static struct nfs4_stateowner * -find_lockstateowner_str(struct inode *inode, clientid_t *clid, +static struct nfs4_lockowner * +find_lockowner_str(struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) { unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); @@ -3695,11 +3818,17 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid, list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { if (same_owner_str(op, owner, clid)) - return op; + return lockowner(op); } return NULL; } +static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) +{ + list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); + list_add(&lo->lo_perstateid, &open_stp->st_lockowners); +} + /* * Alloc a lock owner structure. * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has @@ -3708,67 +3837,40 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid, * strhashval = lock_ownerstr_hashval */ -static struct nfs4_stateowner * -alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { - struct nfs4_stateowner *sop; - struct nfs4_replay *rp; - unsigned int idhashval; +static struct nfs4_lockowner * +alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) { + struct nfs4_lockowner *lo; - if (!(sop = alloc_stateowner(&lock->lk_new_owner))) + lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); + if (!lo) return NULL; - idhashval = lockownerid_hashval(current_ownerid); - INIT_LIST_HEAD(&sop->so_idhash); - INIT_LIST_HEAD(&sop->so_strhash); - INIT_LIST_HEAD(&sop->so_perclient); - INIT_LIST_HEAD(&sop->so_stateids); - INIT_LIST_HEAD(&sop->so_perstateid); - INIT_LIST_HEAD(&sop->so_close_lru); /* not used */ - sop->so_time = 0; - list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perstateid, &open_stp->st_lockowners); - sop->so_is_open_owner = 0; - sop->so_id = current_ownerid++; - sop->so_client = clp; + INIT_LIST_HEAD(&lo->lo_owner.so_stateids); + lo->lo_owner.so_is_open_owner = 0; /* It is the openowner seqid that will be incremented in encode in the * case of new lockowners; so increment the lock seqid manually: */ - sop->so_seqid = lock->lk_new_lock_seqid + 1; - sop->so_confirmed = 1; - rp = &sop->so_replay; - rp->rp_status = nfserr_serverfault; - rp->rp_buflen = 0; - rp->rp_buf = rp->rp_ibuf; - return sop; + lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1; + hash_lockowner(lo, strhashval, clp, open_stp); + return lo; } -static struct nfs4_stateid * -alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp) +static struct nfs4_ol_stateid * +alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp) { - struct nfs4_stateid *stp; - unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); + struct nfs4_ol_stateid *stp; + struct nfs4_client *clp = lo->lo_owner.so_client; - stp = nfs4_alloc_stateid(); + stp = nfs4_alloc_stateid(clp); if (stp == NULL) - goto out; - INIT_LIST_HEAD(&stp->st_hash); - INIT_LIST_HEAD(&stp->st_perfile); - INIT_LIST_HEAD(&stp->st_perstateowner); - INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ - list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + return NULL; + init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); list_add(&stp->st_perfile, &fp->fi_stateids); - list_add(&stp->st_perstateowner, &sop->so_stateids); - stp->st_stateowner = sop; + list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); + stp->st_stateowner = &lo->lo_owner; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = sop->so_id; - stp->st_stateid.si_fileid = fp->fi_id; - stp->st_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - -out: return stp; } @@ -3779,7 +3881,7 @@ check_lock_length(u64 offset, u64 length) LOFF_OVERFLOW(offset, length))); } -static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access) +static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) { struct nfs4_file *fp = lock_stp->st_file; int oflag = nfs4_access_to_omode(access); @@ -3797,15 +3899,16 @@ __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_lock *lock) { - struct nfs4_stateowner *open_sop = NULL; - struct nfs4_stateowner *lock_sop = NULL; - struct nfs4_stateid *lock_stp; + struct nfs4_openowner *open_sop = NULL; + struct nfs4_lockowner *lock_sop = NULL; + struct nfs4_ol_stateid *lock_stp; struct nfs4_file *fp; struct file *filp = NULL; struct file_lock file_lock; struct file_lock conflock; __be32 status = 0; unsigned int strhashval; + int lkflg; int err; dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", @@ -3829,7 +3932,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Use open owner and open stateid to create lock owner and * lock stateid. */ - struct nfs4_stateid *open_stp = NULL; + struct nfs4_ol_stateid *open_stp = NULL; status = nfserr_stale_clientid; if (!nfsd4_has_session(cstate) && @@ -3837,26 +3940,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; /* validate and update open stateid and open seqid */ - status = nfs4_preprocess_seqid_op(cstate, + status = nfs4_preprocess_confirmed_seqid_op(cstate, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, - OPEN_STATE, - &lock->lk_replay_owner, &open_stp, - lock); + &open_stp); if (status) goto out; - open_sop = lock->lk_replay_owner; + open_sop = openowner(open_stp->st_stateowner); + status = nfserr_bad_stateid; + if (!nfsd4_has_session(cstate) && + !same_clid(&open_sop->oo_owner.so_client->cl_clientid, + &lock->v.new.clientid)) + goto out; /* create lockowner and lock stateid */ fp = open_stp->st_file; - strhashval = lock_ownerstr_hashval(fp->fi_inode, - open_sop->so_client->cl_clientid.cl_id, + strhashval = lock_ownerstr_hashval(fp->fi_inode, + open_sop->oo_owner.so_client->cl_clientid.cl_id, &lock->v.new.owner); /* XXX: Do we need to check for duplicate stateowners on * the same file, or should they just be allowed (and * create new stateids)? */ - status = nfserr_resource; + status = nfserr_jukebox; lock_sop = alloc_init_lock_stateowner(strhashval, - open_sop->so_client, open_stp, lock); + open_sop->oo_owner.so_client, open_stp, lock); if (lock_sop == NULL) goto out; lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); @@ -3865,16 +3971,20 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } else { /* lock (lock owner + lock stateid) already exists */ status = nfs4_preprocess_seqid_op(cstate, - lock->lk_old_lock_seqid, - &lock->lk_old_lock_stateid, - LOCK_STATE, - &lock->lk_replay_owner, &lock_stp, lock); + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, + NFS4_LOCK_STID, &lock_stp); if (status) goto out; - lock_sop = lock->lk_replay_owner; + lock_sop = lockowner(lock_stp->st_stateowner); fp = lock_stp->st_file; } - /* lock->lk_replay_owner and lock_stp have been created or found */ + /* lock_sop and lock_stp have been created or found */ + + lkflg = setlkflg(lock->lk_type); + status = nfs4_check_openmode(lock_stp, lkflg); + if (status) + goto out; status = nfserr_grace; if (locks_in_grace() && !lock->lk_reclaim) @@ -3925,8 +4035,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); switch (-err) { case 0: /* success! */ - update_stateid(&lock_stp->st_stateid); - memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, + update_stateid(&lock_stp->st_stid.sc_stateid); + memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid, sizeof(stateid_t)); status = 0; break; @@ -3938,19 +4048,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, case (EDEADLK): status = nfserr_deadlock; break; - default: + default: dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); - status = nfserr_resource; + status = nfserrno(err); break; } out: if (status && lock->lk_is_new && lock_sop) release_lockowner(lock_sop); - if (lock->lk_replay_owner) { - nfs4_get_stateowner(lock->lk_replay_owner); - cstate->replay_owner = lock->lk_replay_owner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } @@ -3982,6 +4089,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct inode *inode; struct file_lock file_lock; + struct nfs4_lockowner *lo; int error; __be32 status; @@ -3991,19 +4099,14 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (check_lock_length(lockt->lt_offset, lockt->lt_length)) return nfserr_inval; - lockt->lt_stateowner = NULL; nfs4_lock_state(); status = nfserr_stale_clientid; if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) goto out; - if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { - dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n"); - if (status == nfserr_symlink) - status = nfserr_inval; + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) goto out; - } inode = cstate->current_fh.fh_dentry->d_inode; locks_init_lock(&file_lock); @@ -4022,10 +4125,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - lockt->lt_stateowner = find_lockstateowner_str(inode, - &lockt->lt_clientid, &lockt->lt_owner); - if (lockt->lt_stateowner) - file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; + lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); + if (lo) + file_lock.fl_owner = (fl_owner_t)lo; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; @@ -4053,7 +4155,7 @@ __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_locku *locku) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; struct file *filp = NULL; struct file_lock file_lock; __be32 status; @@ -4068,13 +4170,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, - locku->lu_seqid, - &locku->lu_stateid, - LOCK_STATE, - &locku->lu_stateowner, &stp, NULL))) + status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, + &locku->lu_stateid, NFS4_LOCK_STID, &stp); + if (status) goto out; - filp = find_any_file(stp->st_file); if (!filp) { status = nfserr_lock_range; @@ -4083,7 +4182,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, BUG_ON(!filp); locks_init_lock(&file_lock); file_lock.fl_type = F_UNLCK; - file_lock.fl_owner = (fl_owner_t) locku->lu_stateowner; + file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner); file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; @@ -4104,15 +4203,12 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* * OK, unlock succeeded; the only thing left to do is update the stateid. */ - update_stateid(&stp->st_stateid); - memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); out: - if (locku->lu_stateowner) { - nfs4_get_stateowner(locku->lu_stateowner); - cstate->replay_owner = locku->lu_stateowner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; out_nfserr: @@ -4126,7 +4222,7 @@ out_nfserr: * 0: no locks held by lockowner */ static int -check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner) +check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) { struct file_lock **flpp; struct inode *inode = filp->fi_inode; @@ -4151,7 +4247,8 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, { clientid_t *clid = &rlockowner->rl_clientid; struct nfs4_stateowner *sop; - struct nfs4_stateid *stp; + struct nfs4_lockowner *lo; + struct nfs4_ol_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; struct list_head matches; int i; @@ -4175,16 +4272,15 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, * data structures. */ INIT_LIST_HEAD(&matches); for (i = 0; i < LOCK_HASH_SIZE; i++) { - list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) { + list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) { if (!same_owner_str(sop, owner, clid)) continue; list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { - if (check_for_locks(stp->st_file, sop)) + lo = lockowner(sop); + if (check_for_locks(stp->st_file, lo)) goto out; - /* Note: so_perclient unused for lockowners, - * so it's OK to fool with here. */ - list_add(&sop->so_perclient, &matches); + list_add(&lo->lo_list, &matches); } } } @@ -4193,12 +4289,12 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, * have been checked. */ status = nfs_ok; while (!list_empty(&matches)) { - sop = list_entry(matches.next, struct nfs4_stateowner, - so_perclient); + lo = list_entry(matches.next, struct nfs4_lockowner, + lo_list); /* unhash_stateowner deletes so_perclient only * for openowners. */ - list_del(&sop->so_perclient); - release_lockowner(sop); + list_del(&lo->lo_list); + release_lockowner(lo); } out: nfs4_unlock_state(); @@ -4320,16 +4416,10 @@ nfs4_state_init(void) for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } - for (i = 0; i < OWNER_HASH_SIZE; i++) { - INIT_LIST_HEAD(&ownerstr_hashtbl[i]); - INIT_LIST_HEAD(&ownerid_hashtbl[i]); - } - for (i = 0; i < STATEID_HASH_SIZE; i++) { - INIT_LIST_HEAD(&stateid_hashtbl[i]); - INIT_LIST_HEAD(&lockstateid_hashtbl[i]); + for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { + INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); } for (i = 0; i < LOCK_HASH_SIZE; i++) { - INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]); INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); } memset(&onestateid, ~0, sizeof(stateid_t)); @@ -4346,7 +4436,7 @@ nfsd4_load_reboot_recovery_data(void) int status; nfs4_lock_state(); - nfsd4_init_recdir(user_recovery_dirname); + nfsd4_init_recdir(); status = nfsd4_recdir_load(); nfs4_unlock_state(); if (status) @@ -4455,40 +4545,3 @@ nfs4_state_shutdown(void) nfs4_unlock_state(); nfsd4_destroy_callback_queue(); } - -/* - * user_recovery_dirname is protected by the nfsd_mutex since it's only - * accessed when nfsd is starting. - */ -static void -nfs4_set_recdir(char *recdir) -{ - strcpy(user_recovery_dirname, recdir); -} - -/* - * Change the NFSv4 recovery directory to recdir. - */ -int -nfs4_reset_recoverydir(char *recdir) -{ - int status; - struct path path; - - status = kern_path(recdir, LOOKUP_FOLLOW, &path); - if (status) - return status; - status = -ENOTDIR; - if (S_ISDIR(path.dentry->d_inode->i_mode)) { - nfs4_set_recdir(recdir); - status = 0; - } - path_put(&path); - return status; -} - -char * -nfs4_recoverydir(void) -{ - return user_recovery_dirname; -} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 990181103214..b6fa792d6b85 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -44,13 +44,15 @@ #include <linux/namei.h> #include <linux/statfs.h> #include <linux/utsname.h> +#include <linux/pagemap.h> #include <linux/sunrpc/svcauth_gss.h> #include "idmap.h" #include "acl.h" #include "xdr4.h" #include "vfs.h" - +#include "state.h" +#include "cache.h" #define NFSDDBG_FACILITY NFSDDBG_XDR @@ -131,6 +133,22 @@ xdr_error: \ } \ } while (0) +static void save_buf(struct nfsd4_compoundargs *argp, struct nfsd4_saved_compoundargs *savep) +{ + savep->p = argp->p; + savep->end = argp->end; + savep->pagelen = argp->pagelen; + savep->pagelist = argp->pagelist; +} + +static void restore_buf(struct nfsd4_compoundargs *argp, struct nfsd4_saved_compoundargs *savep) +{ + argp->p = savep->p; + argp->end = savep->end; + argp->pagelen = savep->pagelen; + argp->pagelist = savep->pagelist; +} + static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) { /* We want more bytes than seem to be available. @@ -438,7 +456,6 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) { DECODE_HEAD; - close->cl_stateowner = NULL; READ_BUF(4); READ32(close->cl_seqid); return nfsd4_decode_stateid(argp, &close->cl_stateid); @@ -533,7 +550,6 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) { DECODE_HEAD; - lock->lk_replay_owner = NULL; /* * type, reclaim(boolean), offset, length, new_lock_owner(boolean) */ @@ -593,7 +609,6 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) { DECODE_HEAD; - locku->lu_stateowner = NULL; READ_BUF(8); READ32(locku->lu_type); if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) @@ -624,6 +639,83 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup DECODE_TAIL; } +static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) +{ + __be32 *p; + u32 w; + + READ_BUF(4); + READ32(w); + *x = w; + switch (w & NFS4_SHARE_ACCESS_MASK) { + case NFS4_SHARE_ACCESS_READ: + case NFS4_SHARE_ACCESS_WRITE: + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + return nfserr_bad_xdr; + } + w &= ~NFS4_SHARE_ACCESS_MASK; + if (!w) + return nfs_ok; + if (!argp->minorversion) + return nfserr_bad_xdr; + switch (w & NFS4_SHARE_WANT_MASK) { + case NFS4_SHARE_WANT_NO_PREFERENCE: + case NFS4_SHARE_WANT_READ_DELEG: + case NFS4_SHARE_WANT_WRITE_DELEG: + case NFS4_SHARE_WANT_ANY_DELEG: + case NFS4_SHARE_WANT_NO_DELEG: + case NFS4_SHARE_WANT_CANCEL: + break; + default: + return nfserr_bad_xdr; + } + w &= ~NFS4_SHARE_WANT_MASK; + if (!w) + return nfs_ok; + switch (w) { + case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL: + case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED: + case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL | + NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED): + return nfs_ok; + } +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) +{ + __be32 *p; + + READ_BUF(4); + READ32(*x); + /* Note: unlinke access bits, deny bits may be zero. */ + if (*x & ~NFS4_SHARE_DENY_BOTH) + return nfserr_bad_xdr; + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) +{ + __be32 *p; + + READ_BUF(4); + READ32(o->len); + + if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) + return nfserr_bad_xdr; + + READ_BUF(o->len); + SAVEMEM(o->data, o->len); + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + static __be32 nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) { @@ -631,19 +723,23 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) memset(open->op_bmval, 0, sizeof(open->op_bmval)); open->op_iattr.ia_valid = 0; - open->op_stateowner = NULL; + open->op_openowner = NULL; /* seqid, share_access, share_deny, clientid, ownerlen */ - READ_BUF(16 + sizeof(clientid_t)); + READ_BUF(4); READ32(open->op_seqid); - READ32(open->op_share_access); - READ32(open->op_share_deny); + status = nfsd4_decode_share_access(argp, &open->op_share_access); + if (status) + goto xdr_error; + status = nfsd4_decode_share_deny(argp, &open->op_share_deny); + if (status) + goto xdr_error; + READ_BUF(sizeof(clientid_t)); COPYMEM(&open->op_clientid, sizeof(clientid_t)); - READ32(open->op_owner.len); - - /* owner, open_flag */ - READ_BUF(open->op_owner.len + 4); - SAVEMEM(open->op_owner.data, open->op_owner.len); + status = nfsd4_decode_opaque(argp, &open->op_owner); + if (status) + goto xdr_error; + READ_BUF(4); READ32(open->op_create); switch (open->op_create) { case NFS4_OPEN_NOCREATE: @@ -709,6 +805,19 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) return status; break; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + if (argp->minorversion < 1) + goto xdr_error; + /* void */ + break; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + if (argp->minorversion < 1) + goto xdr_error; + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + break; default: goto xdr_error; } @@ -721,7 +830,6 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con { DECODE_HEAD; - open_conf->oc_stateowner = NULL; status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); if (status) return status; @@ -736,15 +844,17 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d { DECODE_HEAD; - open_down->od_stateowner = NULL; status = nfsd4_decode_stateid(argp, &open_down->od_stateid); if (status) return status; - READ_BUF(12); + READ_BUF(4); READ32(open_down->od_seqid); - READ32(open_down->od_share_access); - READ32(open_down->od_share_deny); - + status = nfsd4_decode_share_access(argp, &open_down->od_share_access); + if (status) + return status; + status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny); + if (status) + return status; DECODE_TAIL; } @@ -885,12 +995,13 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient { DECODE_HEAD; - READ_BUF(12); + READ_BUF(8); COPYMEM(setclientid->se_verf.data, 8); - READ32(setclientid->se_namelen); - READ_BUF(setclientid->se_namelen + 8); - SAVEMEM(setclientid->se_name, setclientid->se_namelen); + status = nfsd4_decode_opaque(argp, &setclientid->se_name); + if (status) + return nfserr_bad_xdr; + READ_BUF(8); READ32(setclientid->se_callback_prog); READ32(setclientid->se_callback_netid_len); @@ -1033,11 +1144,9 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, READ_BUF(NFS4_VERIFIER_SIZE); COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); - READ_BUF(4); - READ32(exid->clname.len); - - READ_BUF(exid->clname.len); - SAVEMEM(exid->clname.data, exid->clname.len); + status = nfsd4_decode_opaque(argp, &exid->clname); + if (status) + return nfserr_bad_xdr; READ_BUF(4); READ32(exid->flags); @@ -1246,6 +1355,19 @@ nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, } static __be32 +nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, + struct nfsd4_free_stateid *free_stateid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(stateid_t)); + READ32(free_stateid->fr_stateid.si_generation); + COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t)); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, struct nfsd4_sequence *seq) { @@ -1261,6 +1383,50 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, DECODE_TAIL; } +static __be32 +nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) +{ + unsigned int nbytes; + stateid_t si; + int i; + __be32 *p; + __be32 status; + + READ_BUF(4); + test_stateid->ts_num_ids = ntohl(*p++); + + nbytes = test_stateid->ts_num_ids * sizeof(stateid_t); + if (nbytes > (u32)((char *)argp->end - (char *)argp->p)) + goto xdr_error; + + test_stateid->ts_saved_args = argp; + save_buf(argp, &test_stateid->ts_savedp); + + for (i = 0; i < test_stateid->ts_num_ids; i++) { + status = nfsd4_decode_stateid(argp, &si); + if (status) + return status; + } + + status = 0; +out: + return status; +xdr_error: + dprintk("NFSD: xdr error (%s:%d)\n", __FILE__, __LINE__); + status = nfserr_bad_xdr; + goto out; +} + +static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc) +{ + DECODE_HEAD; + + READ_BUF(8); + COPYMEM(&dc->clientid, 8); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) { DECODE_HEAD; @@ -1370,7 +1536,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, - [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -1380,9 +1546,9 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, }; @@ -1402,6 +1568,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) DECODE_HEAD; struct nfsd4_op *op; struct nfsd4_minorversion_ops *ops; + bool cachethis = false; int i; /* @@ -1483,7 +1650,16 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) argp->opcnt = i+1; break; } + /* + * We'll try to cache the result in the DRC if any one + * op in the compound wants to be cached: + */ + cachethis |= nfsd4_cache_this_op(op); } + /* Sessions make the DRC unnecessary: */ + if (argp->minorversion) + cachethis = false; + argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; DECODE_TAIL; } @@ -1555,15 +1731,20 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) * we know whether the error to be returned is a sequence id mutating error. */ -#define ENCODE_SEQID_OP_TAIL(stateowner) do { \ - if (seqid_mutating_err(nfserr) && stateowner) { \ - stateowner->so_seqid++; \ - stateowner->so_replay.rp_status = nfserr; \ - stateowner->so_replay.rp_buflen = \ - (((char *)(resp)->p - (char *)save)); \ - memcpy(stateowner->so_replay.rp_buf, save, \ - stateowner->so_replay.rp_buflen); \ - } } while (0); +static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr) +{ + struct nfs4_stateowner *stateowner = resp->cstate.replay_owner; + + if (seqid_mutating_err(ntohl(nfserr)) && stateowner) { + stateowner->so_seqid++; + stateowner->so_replay.rp_status = nfserr; + stateowner->so_replay.rp_buflen = + (char *)resp->p - (char *)save; + memcpy(stateowner->so_replay.rp_buf, save, + stateowner->so_replay.rp_buflen); + nfsd4_purge_closed_stateid(stateowner); + } +} /* Encode as an array of strings the string given with components * separated @sep. @@ -1622,36 +1803,89 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, } /* - * Return the path to an export point in the pseudo filesystem namespace - * Returned string is safe to use as long as the caller holds a reference - * to @exp. + * Encode a path in RFC3530 'pathname4' format */ -static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) +static __be32 nfsd4_encode_path(const struct path *root, + const struct path *path, __be32 **pp, int *buflen) { - struct svc_fh tmp_fh; - char *path = NULL, *rootpath; - size_t rootlen; + struct path cur = { + .mnt = path->mnt, + .dentry = path->dentry, + }; + __be32 *p = *pp; + struct dentry **components = NULL; + unsigned int ncomponents = 0; + __be32 err = nfserr_jukebox; - fh_init(&tmp_fh, NFS4_FHSIZE); - *stat = exp_pseudoroot(rqstp, &tmp_fh); - if (*stat) - return NULL; - rootpath = tmp_fh.fh_export->ex_pathname; + dprintk("nfsd4_encode_components("); - path = exp->ex_pathname; + path_get(&cur); + /* First walk the path up to the nfsd root, and store the + * dentries/path components in an array. + */ + for (;;) { + if (cur.dentry == root->dentry && cur.mnt == root->mnt) + break; + if (cur.dentry == cur.mnt->mnt_root) { + if (follow_up(&cur)) + continue; + goto out_free; + } + if ((ncomponents & 15) == 0) { + struct dentry **new; + new = krealloc(components, + sizeof(*new) * (ncomponents + 16), + GFP_KERNEL); + if (!new) + goto out_free; + components = new; + } + components[ncomponents++] = cur.dentry; + cur.dentry = dget_parent(cur.dentry); + } - rootlen = strlen(rootpath); - if (strncmp(path, rootpath, rootlen)) { - dprintk("nfsd: fs_locations failed;" - "%s is not contained in %s\n", path, rootpath); - *stat = nfserr_notsupp; - path = NULL; - goto out; + *buflen -= 4; + if (*buflen < 0) + goto out_free; + WRITE32(ncomponents); + + while (ncomponents) { + struct dentry *dentry = components[ncomponents - 1]; + unsigned int len = dentry->d_name.len; + + *buflen -= 4 + (XDR_QUADLEN(len) << 2); + if (*buflen < 0) + goto out_free; + WRITE32(len); + WRITEMEM(dentry->d_name.name, len); + dprintk("/%s", dentry->d_name.name); + dput(dentry); + ncomponents--; } - path += rootlen; -out: - fh_put(&tmp_fh); - return path; + + *pp = p; + err = 0; +out_free: + dprintk(")\n"); + while (ncomponents) + dput(components[--ncomponents]); + kfree(components); + path_put(&cur); + return err; +} + +static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, + const struct path *path, __be32 **pp, int *buflen) +{ + struct svc_export *exp_ps; + __be32 res; + + exp_ps = rqst_find_fsidzero_export(rqstp); + if (IS_ERR(exp_ps)) + return nfserrno(PTR_ERR(exp_ps)); + res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen); + exp_put(exp_ps); + return res; } /* @@ -1665,11 +1899,8 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, int i; __be32 *p = *pp; struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; - char *root = nfsd4_path(rqstp, exp, &status); - if (status) - return status; - status = nfsd4_encode_components('/', root, &p, buflen); + status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen); if (status) return status; if ((*buflen -= 4) < 0) @@ -1685,12 +1916,19 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, return 0; } -static u32 nfs4_ftypes[16] = { - NF4BAD, NF4FIFO, NF4CHR, NF4BAD, - NF4DIR, NF4BAD, NF4BLK, NF4BAD, - NF4REG, NF4BAD, NF4LNK, NF4BAD, - NF4SOCK, NF4BAD, NF4LNK, NF4BAD, -}; +static u32 nfs4_file_type(umode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFIFO: return NF4FIFO; + case S_IFCHR: return NF4CHR; + case S_IFDIR: return NF4DIR; + case S_IFBLK: return NF4BLK; + case S_IFLNK: return NF4LNK; + case S_IFREG: return NF4REG; + case S_IFSOCK: return NF4SOCK; + default: return NF4BAD; + }; +} static __be32 nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, @@ -1879,7 +2117,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (bmval0 & FATTR4_WORD0_TYPE) { if ((buflen -= 4) < 0) goto out_resource; - dummy = nfs4_ftypes[(stat.mode & S_IFMT) >> 12]; + dummy = nfs4_file_type(stat.mode); if (dummy == NF4BAD) goto out_serverfault; WRITE32(dummy); @@ -2413,7 +2651,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c if (!nfserr) nfsd4_encode_stateid(resp, &close->cl_stateid); - ENCODE_SEQID_OP_TAIL(close->cl_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2489,17 +2727,18 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh static void nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) { + struct xdr_netobj *conf = &ld->ld_owner; __be32 *p; - RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0)); + RESERVE_SPACE(32 + XDR_LEN(conf->len)); WRITE64(ld->ld_start); WRITE64(ld->ld_length); WRITE32(ld->ld_type); - if (ld->ld_sop) { + if (conf->len) { WRITEMEM(&ld->ld_clientid, 8); - WRITE32(ld->ld_sop->so_owner.len); - WRITEMEM(ld->ld_sop->so_owner.data, ld->ld_sop->so_owner.len); - kref_put(&ld->ld_sop->so_ref, nfs4_free_stateowner); + WRITE32(conf->len); + WRITEMEM(conf->data, conf->len); + kfree(conf->data); } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ WRITE64((u64)0); /* clientid */ WRITE32(0); /* length of owner name */ @@ -2517,7 +2756,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo else if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lock->lk_denied); - ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2537,7 +2776,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l if (!nfserr) nfsd4_encode_stateid(resp, &locku->lu_stateid); - ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2618,7 +2857,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op } /* XXX save filehandle here */ out: - ENCODE_SEQID_OP_TAIL(open->op_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2630,7 +2869,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct if (!nfserr) nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); - ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2642,7 +2881,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc if (!nfserr) nfsd4_encode_stateid(resp, &od->od_stateid); - ENCODE_SEQID_OP_TAIL(od->od_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2684,8 +2923,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, &maxcount); - if (nfserr == nfserr_symlink) - nfserr = nfserr_inval; if (nfserr) return nfserr; eof = (read->rd_offset + maxcount >= @@ -2811,8 +3048,6 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 readdir->common.err == nfserr_toosmall && readdir->buffer == page) nfserr = nfserr_toosmall; - if (nfserr == nfserr_symlink) - nfserr = nfserr_notdir; if (nfserr) goto err_no_verf; @@ -3116,6 +3351,21 @@ nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, } static __be32 +nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_free_stateid *free_stateid) +{ + __be32 *p; + + if (nfserr) + return nfserr; + + RESERVE_SPACE(4); + WRITE32(nfserr); + ADJUST_ARGS(); + return nfserr; +} + +static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_sequence *seq) { @@ -3128,9 +3378,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); WRITE32(seq->seqid); WRITE32(seq->slotid); - WRITE32(seq->maxslots); - /* For now: target_maxslots = maxslots */ - WRITE32(seq->maxslots); + /* Note slotid's are numbered from zero: */ + WRITE32(seq->maxslots - 1); /* sr_highest_slotid */ + WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */ WRITE32(seq->status_flags); ADJUST_ARGS(); @@ -3138,6 +3388,37 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, return 0; } +__be32 +nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_test_stateid *test_stateid) +{ + struct nfsd4_compoundargs *argp; + struct nfs4_client *cl = resp->cstate.session->se_client; + stateid_t si; + __be32 *p; + int i; + int valid; + + restore_buf(test_stateid->ts_saved_args, &test_stateid->ts_savedp); + argp = test_stateid->ts_saved_args; + + RESERVE_SPACE(4); + *p++ = htonl(test_stateid->ts_num_ids); + resp->p = p; + + nfs4_lock_state(); + for (i = 0; i < test_stateid->ts_num_ids; i++) { + nfsd4_decode_stateid(argp, &si); + valid = nfs4_validate_stateid(cl, &si); + RESERVE_SPACE(4); + *p++ = htonl(valid); + resp->p = p; + } + nfs4_unlock_state(); + + return nfserr; +} + static __be32 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) { @@ -3196,7 +3477,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, - [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_free_stateid, [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, @@ -3206,7 +3487,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, - [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid, [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, @@ -3214,34 +3495,29 @@ static nfsd4_enc nfsd4_enc_ops[] = { /* * Calculate the total amount of memory that the compound response has taken - * after encoding the current operation. + * after encoding the current operation with pad. * - * pad: add on 8 bytes for the next operation's op_code and status so that - * there is room to cache a failure on the next operation. + * pad: if operation is non-idempotent, pad was calculate by op_rsize_bop() + * which was specified at nfsd4_operation, else pad is zero. * - * Compare this length to the session se_fmaxresp_cached. + * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached. * * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so * will be at least a page and will therefore hold the xdr_buf head. */ -static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) +int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) { - int status = 0; struct xdr_buf *xb = &resp->rqstp->rq_res; - struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; struct nfsd4_session *session = NULL; struct nfsd4_slot *slot = resp->cstate.slot; - u32 length, tlen = 0, pad = 8; + u32 length, tlen = 0; if (!nfsd4_has_session(&resp->cstate)) - return status; + return 0; session = resp->cstate.session; - if (session == NULL || slot->sl_cachethis == 0) - return status; - - if (resp->opcnt >= args->opcnt) - pad = 0; /* this is the last operation */ + if (session == NULL) + return 0; if (xb->page_len == 0) { length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; @@ -3254,10 +3530,14 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, length, xb->page_len, tlen, pad); - if (length <= session->se_fchannel.maxresp_cached) - return status; - else + if (length > session->se_fchannel.maxresp_sz) + return nfserr_rep_too_big; + + if (slot->sl_cachethis == 1 && + length > session->se_fchannel.maxresp_cached) return nfserr_rep_too_big_to_cache; + + return 0; } void @@ -3277,8 +3557,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) !nfsd4_enc_ops[op->opnum]); op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); /* nfsd4_check_drc_limit guarantees enough room for error status */ - if (!op->status && nfsd4_check_drc_limit(resp)) - op->status = nfserr_rep_too_big_to_cache; + if (!op->status) + op->status = nfsd4_check_resp_size(resp, 0); status: /* * Note: We write the status directly, instead of using WRITE32(), @@ -3319,8 +3599,11 @@ nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) return xdr_ressize_check(rqstp, p); } -void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args) +int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp) { + struct svc_rqst *rqstp = rq; + struct nfsd4_compoundargs *args = rqstp->rq_argp; + if (args->ops != args->iops) { kfree(args->ops); args->ops = args->iops; @@ -3333,13 +3616,12 @@ void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args) tb->release(tb->buf); kfree(tb); } + return 1; } int nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args) { - __be32 status; - args->p = p; args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; args->pagelist = rqstp->rq_arg.pages; @@ -3349,11 +3631,7 @@ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_comp args->ops = args->iops; args->rqstp = rqstp; - status = nfsd4_decode_compound(args); - if (status) { - nfsd4_release_compoundargs(args); - } - return !status; + return !nfsd4_decode_compound(args); } int diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 4666a209678a..2cbac34a55da 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -118,7 +118,7 @@ hash_refile(struct svc_cacherep *rp) * Note that no operation within the loop may sleep. */ int -nfsd_cache_lookup(struct svc_rqst *rqstp, int type) +nfsd_cache_lookup(struct svc_rqst *rqstp) { struct hlist_node *hn; struct hlist_head *rh; @@ -128,6 +128,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type) vers = rqstp->rq_vers, proc = rqstp->rq_proc; unsigned long age; + int type = rqstp->rq_cachetype; int rtn; rqstp->rq_cacherep = NULL; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 2b1449dd2f49..c45a2ea4a090 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -9,11 +9,11 @@ #include <linux/ctype.h> #include <linux/sunrpc/svcsock.h> -#include <linux/nfsd/syscall.h> #include <linux/lockd/lockd.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/gss_api.h> #include <linux/sunrpc/gss_krb5_enctypes.h> +#include <linux/module.h> #include "idmap.h" #include "nfsd.h" @@ -24,15 +24,6 @@ */ enum { NFSD_Root = 1, -#ifdef CONFIG_NFSD_DEPRECATED - NFSD_Svc, - NFSD_Add, - NFSD_Del, - NFSD_Export, - NFSD_Unexport, - NFSD_Getfd, - NFSD_Getfs, -#endif NFSD_List, NFSD_Export_features, NFSD_Fh, @@ -59,15 +50,6 @@ enum { /* * write() for these nodes. */ -#ifdef CONFIG_NFSD_DEPRECATED -static ssize_t write_svc(struct file *file, char *buf, size_t size); -static ssize_t write_add(struct file *file, char *buf, size_t size); -static ssize_t write_del(struct file *file, char *buf, size_t size); -static ssize_t write_export(struct file *file, char *buf, size_t size); -static ssize_t write_unexport(struct file *file, char *buf, size_t size); -static ssize_t write_getfd(struct file *file, char *buf, size_t size); -static ssize_t write_getfs(struct file *file, char *buf, size_t size); -#endif static ssize_t write_filehandle(struct file *file, char *buf, size_t size); static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); @@ -83,15 +65,6 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); #endif static ssize_t (*write_op[])(struct file *, char *, size_t) = { -#ifdef CONFIG_NFSD_DEPRECATED - [NFSD_Svc] = write_svc, - [NFSD_Add] = write_add, - [NFSD_Del] = write_del, - [NFSD_Export] = write_export, - [NFSD_Unexport] = write_unexport, - [NFSD_Getfd] = write_getfd, - [NFSD_Getfs] = write_getfs, -#endif [NFSD_Fh] = write_filehandle, [NFSD_FO_UnlockIP] = write_unlock_ip, [NFSD_FO_UnlockFS] = write_unlock_fs, @@ -130,16 +103,6 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { -#ifdef CONFIG_NFSD_DEPRECATED - static int warned; - if (file->f_dentry->d_name.name[0] == '.' && !warned) { - printk(KERN_INFO - "Warning: \"%s\" uses deprecated NFSD interface: %s." - " This will be removed in 2.6.40\n", - current->comm, file->f_dentry->d_name.name); - warned = 1; - } -#endif if (! file->private_data) { /* An attempt to read a transaction file without writing * causes a 0-byte write so that the file can return @@ -226,303 +189,6 @@ static const struct file_operations pool_stats_operations = { * payload - write methods */ -#ifdef CONFIG_NFSD_DEPRECATED -/** - * write_svc - Start kernel's NFSD server - * - * Deprecated. /proc/fs/nfsd/threads is preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_svc - * svc_port: port number of this - * server's listener - * svc_nthreads: number of threads to start - * size: size in bytes of passed in nfsctl_svc - * Output: - * On success: returns zero - * On error: return code is negative errno value - */ -static ssize_t write_svc(struct file *file, char *buf, size_t size) -{ - struct nfsctl_svc *data; - int err; - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_svc*) buf; - err = nfsd_svc(data->svc_port, data->svc_nthreads); - if (err < 0) - return err; - return 0; -} - -/** - * write_add - Add or modify client entry in auth unix cache - * - * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_client - * cl_ident: '\0'-terminated C string - * containing domain name - * of client - * cl_naddr: no. of items in cl_addrlist - * cl_addrlist: array of client addresses - * cl_fhkeytype: ignored - * cl_fhkeylen: ignored - * cl_fhkey: ignored - * size: size in bytes of passed in nfsctl_client - * Output: - * On success: returns zero - * On error: return code is negative errno value - * - * Note: Only AF_INET client addresses are passed in, since - * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. - */ -static ssize_t write_add(struct file *file, char *buf, size_t size) -{ - struct nfsctl_client *data; - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_client *)buf; - return exp_addclient(data); -} - -/** - * write_del - Remove client from auth unix cache - * - * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_client - * cl_ident: '\0'-terminated C string - * containing domain name - * of client - * cl_naddr: ignored - * cl_addrlist: ignored - * cl_fhkeytype: ignored - * cl_fhkeylen: ignored - * cl_fhkey: ignored - * size: size in bytes of passed in nfsctl_client - * Output: - * On success: returns zero - * On error: return code is negative errno value - * - * Note: Only AF_INET client addresses are passed in, since - * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. - */ -static ssize_t write_del(struct file *file, char *buf, size_t size) -{ - struct nfsctl_client *data; - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_client *)buf; - return exp_delclient(data); -} - -/** - * write_export - Export part or all of a local file system - * - * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_export - * ex_client: '\0'-terminated C string - * containing domain name - * of client allowed to access - * this export - * ex_path: '\0'-terminated C string - * containing pathname of - * directory in local file system - * ex_dev: fsid to use for this export - * ex_ino: ignored - * ex_flags: export flags for this export - * ex_anon_uid: UID to use for anonymous - * requests - * ex_anon_gid: GID to use for anonymous - * requests - * size: size in bytes of passed in nfsctl_export - * Output: - * On success: returns zero - * On error: return code is negative errno value - */ -static ssize_t write_export(struct file *file, char *buf, size_t size) -{ - struct nfsctl_export *data; - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_export*)buf; - return exp_export(data); -} - -/** - * write_unexport - Unexport a previously exported file system - * - * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_export - * ex_client: '\0'-terminated C string - * containing domain name - * of client no longer allowed - * to access this export - * ex_path: '\0'-terminated C string - * containing pathname of - * directory in local file system - * ex_dev: ignored - * ex_ino: ignored - * ex_flags: ignored - * ex_anon_uid: ignored - * ex_anon_gid: ignored - * size: size in bytes of passed in nfsctl_export - * Output: - * On success: returns zero - * On error: return code is negative errno value - */ -static ssize_t write_unexport(struct file *file, char *buf, size_t size) -{ - struct nfsctl_export *data; - - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_export*)buf; - return exp_unexport(data); -} - -/** - * write_getfs - Get a variable-length NFS file handle by path - * - * Deprecated. /proc/fs/nfsd/filehandle is preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_fsparm - * gd_addr: socket address of client - * gd_path: '\0'-terminated C string - * containing pathname of - * directory in local file system - * gd_maxlen: maximum size of returned file - * handle - * size: size in bytes of passed in nfsctl_fsparm - * Output: - * On success: passed-in buffer filled with a knfsd_fh structure - * (a variable-length raw NFS file handle); - * return code is the size in bytes of the file handle - * On error: return code is negative errno value - * - * Note: Only AF_INET client addresses are passed in, since gd_addr - * is the same size as a struct sockaddr_in. - */ -static ssize_t write_getfs(struct file *file, char *buf, size_t size) -{ - struct nfsctl_fsparm *data; - struct sockaddr_in *sin; - struct auth_domain *clp; - int err = 0; - struct knfsd_fh *res; - struct in6_addr in6; - - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_fsparm*)buf; - err = -EPROTONOSUPPORT; - if (data->gd_addr.sa_family != AF_INET) - goto out; - sin = (struct sockaddr_in *)&data->gd_addr; - if (data->gd_maxlen > NFS3_FHSIZE) - data->gd_maxlen = NFS3_FHSIZE; - - res = (struct knfsd_fh*)buf; - - exp_readlock(); - - ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); - - clp = auth_unix_lookup(&init_net, &in6); - if (!clp) - err = -EPERM; - else { - err = exp_rootfh(clp, data->gd_path, res, data->gd_maxlen); - auth_domain_put(clp); - } - exp_readunlock(); - if (err == 0) - err = res->fh_size + offsetof(struct knfsd_fh, fh_base); - out: - return err; -} - -/** - * write_getfd - Get a fixed-length NFS file handle by path (used by mountd) - * - * Deprecated. /proc/fs/nfsd/filehandle is preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_fdparm - * gd_addr: socket address of client - * gd_path: '\0'-terminated C string - * containing pathname of - * directory in local file system - * gd_version: fdparm structure version - * size: size in bytes of passed in nfsctl_fdparm - * Output: - * On success: passed-in buffer filled with nfsctl_res - * (a fixed-length raw NFS file handle); - * return code is the size in bytes of the file handle - * On error: return code is negative errno value - * - * Note: Only AF_INET client addresses are passed in, since gd_addr - * is the same size as a struct sockaddr_in. - */ -static ssize_t write_getfd(struct file *file, char *buf, size_t size) -{ - struct nfsctl_fdparm *data; - struct sockaddr_in *sin; - struct auth_domain *clp; - int err = 0; - struct knfsd_fh fh; - char *res; - struct in6_addr in6; - - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_fdparm*)buf; - err = -EPROTONOSUPPORT; - if (data->gd_addr.sa_family != AF_INET) - goto out; - err = -EINVAL; - if (data->gd_version < 2 || data->gd_version > NFSSVC_MAXVERS) - goto out; - - res = buf; - sin = (struct sockaddr_in *)&data->gd_addr; - exp_readlock(); - - ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); - - clp = auth_unix_lookup(&init_net, &in6); - if (!clp) - err = -EPERM; - else { - err = exp_rootfh(clp, data->gd_path, &fh, NFS_FHSIZE); - auth_domain_put(clp); - } - exp_readunlock(); - - if (err == 0) { - memset(res,0, NFS_FHSIZE); - memcpy(res, &fh.fh_base, fh.fh_size); - err = NFS_FHSIZE; - } - out: - return err; -} -#endif /* CONFIG_NFSD_DEPRECATED */ /** * write_unlock_ip - Release all locks used by a client @@ -1397,15 +1063,6 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) static int nfsd_fill_super(struct super_block * sb, void * data, int silent) { static struct tree_descr nfsd_files[] = { -#ifdef CONFIG_NFSD_DEPRECATED - [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR}, - [NFSD_Add] = {".add", &transaction_ops, S_IWUSR}, - [NFSD_Del] = {".del", &transaction_ops, S_IWUSR}, - [NFSD_Export] = {".export", &transaction_ops, S_IWUSR}, - [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR}, - [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, - [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, -#endif [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_operations, S_IRUGO}, diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 7ecfa2420307..58134a23fdfb 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -11,13 +11,39 @@ #include <linux/types.h> #include <linux/mount.h> +#include <linux/nfs.h> +#include <linux/nfs2.h> +#include <linux/nfs3.h> +#include <linux/nfs4.h> +#include <linux/sunrpc/msg_prot.h> + #include <linux/nfsd/debug.h> #include <linux/nfsd/export.h> #include <linux/nfsd/stats.h> + /* * nfsd version */ #define NFSD_SUPPORTED_MINOR_VERSION 1 +/* + * Maximum blocksizes supported by daemon under various circumstances. + */ +#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD +/* NFSv2 is limited by the protocol specification, see RFC 1094 */ +#define NFSSVC_MAXBLKSIZE_V2 (8*1024) + + +/* + * Largest number of bytes we need to allocate for an NFS + * call or reply. Used to control buffer sizes. We use + * the length of v3 WRITE, READDIR and READDIR replies + * which are an RPC header, up to 26 XDR units of reply + * data, and some page data. + * + * Note that accuracy here doesn't matter too much as the + * size is rounded up to a page size when allocating space. + */ +#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ @@ -335,6 +361,13 @@ static inline u32 nfsd_suppattrs2(u32 minorversion) #define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ NFSD_WRITEABLE_ATTRS_WORD2 +extern int nfsd4_is_junction(struct dentry *dentry); +#else +static inline int nfsd4_is_junction(struct dentry *dentry) +{ + return 0; +} + #endif /* CONFIG_NFSD_V4 */ #endif /* LINUX_NFSD_NFSD_H */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 90c6aa6d5e0f..c763de5c1157 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -59,28 +59,25 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) * the write call). */ static inline __be32 -nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) +nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int requested) { - /* Type can be negative when creating hardlinks - not to a dir */ - if (type > 0 && (mode & S_IFMT) != type) { - if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) - return nfserr_symlink; - else if (type == S_IFDIR) - return nfserr_notdir; - else if ((mode & S_IFMT) == S_IFDIR) - return nfserr_isdir; - else - return nfserr_inval; - } - if (type < 0 && (mode & S_IFMT) == -type) { - if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) - return nfserr_symlink; - else if (type == -S_IFDIR) - return nfserr_isdir; - else - return nfserr_notdir; - } - return 0; + mode &= S_IFMT; + + if (requested == 0) /* the caller doesn't care */ + return nfs_ok; + if (mode == requested) + return nfs_ok; + /* + * v4 has an error more specific than err_notdir which we should + * return in preference to err_notdir: + */ + if (rqstp->rq_vers == 4 && mode == S_IFLNK) + return nfserr_symlink; + if (requested == S_IFDIR) + return nfserr_notdir; + if (mode == S_IFDIR) + return nfserr_isdir; + return nfserr_inval; } static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 18743c4d8bca..eda7d7e55e05 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/freezer.h> +#include <linux/module.h> #include <linux/fs_struct.h> #include <linux/swap.h> @@ -256,6 +257,8 @@ static void nfsd_last_thread(struct svc_serv *serv) nfsd_serv = NULL; nfsd_shutdown(); + svc_rpcb_cleanup(serv); + printk(KERN_WARNING "nfsd: last server has exited, flushing export " "cache\n"); nfsd_export_flush(); @@ -528,16 +531,9 @@ nfsd(void *vrqstp) continue; } - - /* Lock the export hash tables for reading. */ - exp_readlock(); - validate_process_creds(); svc_process(rqstp); validate_process_creds(); - - /* Unlock export hash tables */ - exp_readunlock(); } /* Clear signals before calling svc_exit_thread() */ @@ -577,8 +573,22 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) rqstp->rq_vers, rqstp->rq_proc); proc = rqstp->rq_procinfo; + /* + * Give the xdr decoder a chance to change this if it wants + * (necessary in the NFSv4.0 compound case) + */ + rqstp->rq_cachetype = proc->pc_cachetype; + /* Decode arguments */ + xdr = proc->pc_decode; + if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base, + rqstp->rq_argp)) { + dprintk("nfsd: failed to decode arguments!\n"); + *statp = rpc_garbage_args; + return 1; + } + /* Check whether we have this call in the cache. */ - switch (nfsd_cache_lookup(rqstp, proc->pc_cachetype)) { + switch (nfsd_cache_lookup(rqstp)) { case RC_INTR: case RC_DROPIT: return 0; @@ -588,16 +598,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) /* do it */ } - /* Decode arguments */ - xdr = proc->pc_decode; - if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base, - rqstp->rq_argp)) { - dprintk("nfsd: failed to decode arguments!\n"); - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); - *statp = rpc_garbage_args; - return 1; - } - /* need to grab the location to store the status, as * nfsv4 does some encoding while processing */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 6bd2f3c21f2b..a3cf38476a1b 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -35,6 +35,7 @@ #ifndef _NFSD4_STATE_H #define _NFSD4_STATE_H +#include <linux/idr.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/nfsd/nfsfh.h> #include "nfsfh.h" @@ -45,24 +46,20 @@ typedef struct { } clientid_t; typedef struct { - u32 so_boot; - u32 so_stateownerid; - u32 so_fileid; + clientid_t so_clid; + u32 so_id; } stateid_opaque_t; typedef struct { u32 si_generation; stateid_opaque_t si_opaque; } stateid_t; -#define si_boot si_opaque.so_boot -#define si_stateownerid si_opaque.so_stateownerid -#define si_fileid si_opaque.so_fileid #define STATEID_FMT "(%08x/%08x/%08x/%08x)" #define STATEID_VAL(s) \ - (s)->si_boot, \ - (s)->si_stateownerid, \ - (s)->si_fileid, \ + (s)->si_opaque.so_clid.cl_boot, \ + (s)->si_opaque.so_clid.cl_id, \ + (s)->si_opaque.so_id, \ (s)->si_generation struct nfsd4_callback { @@ -76,17 +73,27 @@ struct nfsd4_callback { bool cb_done; }; +struct nfs4_stid { +#define NFS4_OPEN_STID 1 +#define NFS4_LOCK_STID 2 +#define NFS4_DELEG_STID 4 +/* For an open stateid kept around *only* to process close replays: */ +#define NFS4_CLOSED_STID 8 + unsigned char sc_type; + stateid_t sc_stateid; + struct nfs4_client *sc_client; +}; + struct nfs4_delegation { + struct nfs4_stid dl_stid; /* must be first field */ struct list_head dl_perfile; struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ atomic_t dl_count; /* ref count */ - struct nfs4_client *dl_client; struct nfs4_file *dl_file; u32 dl_type; time_t dl_time; /* For recall: */ - stateid_t dl_stateid; struct knfsd_fh dl_fh; int dl_retries; struct nfsd4_callback dl_recall; @@ -104,6 +111,11 @@ struct nfs4_cb_conn { struct svc_xprt *cb_xprt; /* minorversion 1 only */ }; +static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_delegation, dl_stid); +} + /* Maximum number of slots per session. 160 is useful for long haul TCP */ #define NFSD_MAX_SLOTS_PER_SESSION 160 /* Maximum number of operations per session compound */ @@ -220,6 +232,7 @@ struct nfs4_client { struct list_head cl_idhash; /* hash by cl_clientid.id */ struct list_head cl_strhash; /* hash by cl_name */ struct list_head cl_openowners; + struct idr cl_stateids; /* stateid lookup */ struct list_head cl_delegations; struct list_head cl_lru; /* tail queue */ struct xdr_netobj cl_name; /* id generated by client */ @@ -245,6 +258,7 @@ struct nfs4_client { #define NFSD4_CB_UP 0 #define NFSD4_CB_UNKNOWN 1 #define NFSD4_CB_DOWN 2 +#define NFSD4_CB_FAULT 3 int cl_cb_state; struct nfsd4_callback cl_cb_null; struct nfsd4_session *cl_cb_session; @@ -293,6 +307,9 @@ static inline void update_stateid(stateid_t *stateid) { stateid->si_generation++; + /* Wraparound recommendation from 3530bis-13 9.1.3.2: */ + if (stateid->si_generation == 0) + stateid->si_generation = 1; } /* A reasonable value for REPLAY_ISIZE was estimated as follows: @@ -312,49 +329,57 @@ struct nfs4_replay { __be32 rp_status; unsigned int rp_buflen; char *rp_buf; - unsigned intrp_allocated; struct knfsd_fh rp_openfh; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; -/* -* nfs4_stateowner can either be an open_owner, or a lock_owner -* -* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[] -* for lock_owner -* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[] -* for lock_owner -* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client -* struct is reaped. -* so_perfilestate: heads the list of nfs4_stateid (either open or lock) -* and is used to ensure no dangling nfs4_stateid references when we -* release a stateowner. -* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when -* close is called to reap associated byte-range locks -* so_close_lru: (open) stateowner is placed on this list instead of being -* reaped (when so_perfilestate is empty) to hold the last close replay. -* reaped by laundramat thread after lease period. -*/ struct nfs4_stateowner { - struct kref so_ref; - struct list_head so_idhash; /* hash by so_id */ struct list_head so_strhash; /* hash by op_name */ - struct list_head so_perclient; struct list_head so_stateids; - struct list_head so_perstateid; /* for lockowners only */ - struct list_head so_close_lru; /* tail queue */ - time_t so_time; /* time of placement on so_close_lru */ - int so_is_open_owner; /* 1=openowner,0=lockowner */ - u32 so_id; struct nfs4_client * so_client; /* after increment in ENCODE_SEQID_OP_TAIL, represents the next * sequence id expected from the client: */ u32 so_seqid; struct xdr_netobj so_owner; /* open owner name */ - int so_confirmed; /* successful OPEN_CONFIRM? */ struct nfs4_replay so_replay; + bool so_is_open_owner; }; +struct nfs4_openowner { + struct nfs4_stateowner oo_owner; /* must be first field */ + struct list_head oo_perclient; + /* + * We keep around openowners a little while after last close, + * which saves clients from having to confirm, and allows us to + * handle close replays if they come soon enough. The close_lru + * is a list of such openowners, to be reaped by the laundromat + * thread eventually if they remain unused: + */ + struct list_head oo_close_lru; + struct nfs4_ol_stateid *oo_last_closed_stid; + time_t oo_time; /* time of placement on so_close_lru */ +#define NFS4_OO_CONFIRMED 1 +#define NFS4_OO_PURGE_CLOSE 2 +#define NFS4_OO_NEW 4 + unsigned char oo_flags; +}; + +struct nfs4_lockowner { + struct nfs4_stateowner lo_owner; /* must be first element */ + struct list_head lo_perstateid; /* for lockowners only */ + struct list_head lo_list; /* for temporary uses */ +}; + +static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_openowner, oo_owner); +} + +static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_lockowner, lo_owner); +} + /* * nfs4_file: a file opened by some number of (open) nfs4_stateowners. * o fi_perfile list is used to search for conflicting @@ -368,17 +393,17 @@ struct nfs4_file { /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ struct file * fi_fds[3]; /* - * Each open or lock stateid contributes 1 to either - * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending - * on open or lock mode: + * Each open or lock stateid contributes 0-4 to the counts + * below depending on which bits are set in st_access_bitmap: + * 1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCES_READ is set + * + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set + * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set. */ atomic_t fi_access[2]; struct file *fi_deleg_file; struct file_lock *fi_lease; atomic_t fi_delegees; struct inode *fi_inode; - u32 fi_id; /* used with stateowner->so_id - * for stateid_hashtbl hash */ bool fi_had_conflict; }; @@ -408,50 +433,27 @@ static inline struct file *find_any_file(struct nfs4_file *f) return f->fi_fds[O_RDONLY]; } -/* -* nfs4_stateid can either be an open stateid or (eventually) a lock stateid -* -* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file -* -* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry -* st_perfile: file_hashtbl[] entry. -* st_perfile_state: nfs4_stateowner->so_perfilestate -* st_perlockowner: (open stateid) list of lock nfs4_stateowners -* st_access_bmap: used only for open stateid -* st_deny_bmap: used only for open stateid -* st_openstp: open stateid lock stateid was derived from -* -* XXX: open stateids and lock stateids have diverged sufficiently that -* we should consider defining separate structs for the two cases. -*/ - -struct nfs4_stateid { - struct list_head st_hash; +/* "ol" stands for "Open or Lock". Better suggestions welcome. */ +struct nfs4_ol_stateid { + struct nfs4_stid st_stid; /* must be first field */ struct list_head st_perfile; struct list_head st_perstateowner; struct list_head st_lockowners; struct nfs4_stateowner * st_stateowner; struct nfs4_file * st_file; - stateid_t st_stateid; unsigned long st_access_bmap; unsigned long st_deny_bmap; - struct nfs4_stateid * st_openstp; + struct nfs4_ol_stateid * st_openstp; }; +static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_ol_stateid, st_stid); +} + /* flags for preprocess_seqid_op() */ -#define HAS_SESSION 0x00000001 -#define CONFIRM 0x00000002 -#define OPEN_STATE 0x00000004 -#define LOCK_STATE 0x00000008 #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 -#define CLOSE_STATE 0x00000040 - -#define seqid_mutating_err(err) \ - (((err) != nfserr_stale_clientid) && \ - ((err) != nfserr_bad_seqid) && \ - ((err) != nfserr_stale_stateid) && \ - ((err) != nfserr_bad_stateid)) struct nfsd4_compound_state; @@ -461,7 +463,8 @@ extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); extern int nfs4_in_grace(void); extern __be32 nfs4_check_open_reclaim(clientid_t *clid); -extern void nfs4_free_stateowner(struct kref *kref); +extern void nfs4_free_openowner(struct nfs4_openowner *); +extern void nfs4_free_lockowner(struct nfs4_lockowner *); extern int set_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); @@ -473,7 +476,7 @@ extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfs4_put_delegation(struct nfs4_delegation *dp); extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); -extern void nfsd4_init_recdir(char *recdir_name); +extern void nfsd4_init_recdir(void); extern int nfsd4_recdir_load(void); extern void nfsd4_shutdown_recdir(void); extern int nfs4_client_to_reclaim(const char *name); @@ -482,17 +485,7 @@ extern void nfsd4_recdir_purge_old(void); extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); - -static inline void -nfs4_put_stateowner(struct nfs4_stateowner *so) -{ - kref_put(&so->so_ref, nfs4_free_stateowner); -} - -static inline void -nfs4_get_stateowner(struct nfs4_stateowner *so) -{ - kref_get(&so->so_ref); -} +extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *); +extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index fd0acca5370a..7a2e442623c8 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -168,6 +168,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) { if (d_mountpoint(dentry)) return 1; + if (nfsd4_is_junction(dentry)) + return 1; if (!(exp->ex_flags & NFSEXP_V4ROOT)) return 0; return dentry->d_inode != NULL; @@ -502,7 +504,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int flags = 0; /* Get inode */ - error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); if (error) return error; @@ -592,6 +594,22 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac return error; } +#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction." +#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type" +int nfsd4_is_junction(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (inode == NULL) + return 0; + if (inode->i_mode & S_IXUGO) + return 0; + if (!(inode->i_mode & S_ISVTX)) + return 0; + if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0) + return 0; + return 1; +} #endif /* defined(CONFIG_NFSD_V4) */ #ifdef CONFIG_NFSD_V3 @@ -1352,7 +1370,7 @@ __be32 do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, char *fname, int flen, struct iattr *iap, struct svc_fh *resfhp, int createmode, u32 *verifier, - int *truncp, int *created) + bool *truncp, bool *created) { struct dentry *dentry, *dchild = NULL; struct inode *dirp; @@ -1632,10 +1650,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; - err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP); + err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP); if (err) goto out; - + err = nfserr_isdir; + if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode)) + goto out; err = nfserr_perm; if (!len) goto out; @@ -2114,7 +2134,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && - acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) + (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || + acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) err = inode_permission(inode, MAY_EXEC); return err? nfserrno(err) : 0; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index e0bbac04d1dd..3f54ad03bb2b 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -10,21 +10,22 @@ /* * Flags for nfsd_permission */ -#define NFSD_MAY_NOP 0 -#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ -#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ -#define NFSD_MAY_READ 4 /* == MAY_READ */ -#define NFSD_MAY_SATTR 8 -#define NFSD_MAY_TRUNC 16 -#define NFSD_MAY_LOCK 32 -#define NFSD_MAY_MASK 63 +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 0x001 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 0x002 /* == MAY_WRITE */ +#define NFSD_MAY_READ 0x004 /* == MAY_READ */ +#define NFSD_MAY_SATTR 0x008 +#define NFSD_MAY_TRUNC 0x010 +#define NFSD_MAY_LOCK 0x020 +#define NFSD_MAY_MASK 0x03f /* extra hints to permission and open routines: */ -#define NFSD_MAY_OWNER_OVERRIDE 64 -#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ -#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 -#define NFSD_MAY_NOT_BREAK_LEASE 512 -#define NFSD_MAY_BYPASS_GSS 1024 +#define NFSD_MAY_OWNER_OVERRIDE 0x040 +#define NFSD_MAY_LOCAL_ACCESS 0x080 /* for device special files */ +#define NFSD_MAY_BYPASS_GSS_ON_ROOT 0x100 +#define NFSD_MAY_NOT_BREAK_LEASE 0x200 +#define NFSD_MAY_BYPASS_GSS 0x400 +#define NFSD_MAY_READ_IF_EXEC 0x800 #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) @@ -61,7 +62,7 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, struct svc_fh *res, int createmode, - u32 *verifier, int *truncp, int *created); + u32 *verifier, bool *truncp, bool *created); __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, loff_t, unsigned long); #endif /* CONFIG_NFSD_V3 */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 366401e1a536..2364747ee97d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -81,7 +81,6 @@ struct nfsd4_access { struct nfsd4_close { u32 cl_seqid; /* request */ stateid_t cl_stateid; /* request+response */ - struct nfs4_stateowner * cl_stateowner; /* response */ }; struct nfsd4_commit { @@ -131,7 +130,7 @@ struct nfsd4_link { struct nfsd4_lock_denied { clientid_t ld_clientid; - struct nfs4_stateowner *ld_sop; + struct xdr_netobj ld_owner; u64 ld_start; u64 ld_length; u32 ld_type; @@ -165,9 +164,6 @@ struct nfsd4_lock { } ok; struct nfsd4_lock_denied denied; } u; - /* The lk_replay_owner is the open owner in the open_to_lock_owner - * case and the lock owner otherwise: */ - struct nfs4_stateowner *lk_replay_owner; }; #define lk_new_open_seqid v.new.open_seqid #define lk_new_open_stateid v.new.open_stateid @@ -188,7 +184,6 @@ struct nfsd4_lockt { struct xdr_netobj lt_owner; u64 lt_offset; u64 lt_length; - struct nfs4_stateowner * lt_stateowner; struct nfsd4_lock_denied lt_denied; }; @@ -199,7 +194,6 @@ struct nfsd4_locku { stateid_t lu_stateid; u64 lu_offset; u64 lu_length; - struct nfs4_stateowner *lu_stateowner; }; @@ -232,8 +226,11 @@ struct nfsd4_open { u32 op_recall; /* recall */ struct nfsd4_change_info op_cinfo; /* response */ u32 op_rflags; /* response */ - int op_truncate; /* used during processing */ - struct nfs4_stateowner *op_stateowner; /* used during processing */ + bool op_truncate; /* used during processing */ + bool op_created; /* used during processing */ + struct nfs4_openowner *op_openowner; /* used during processing */ + struct nfs4_file *op_file; /* used during processing */ + struct nfs4_ol_stateid *op_stp; /* used during processing */ struct nfs4_acl *op_acl; }; #define op_iattr iattr @@ -243,7 +240,6 @@ struct nfsd4_open_confirm { stateid_t oc_req_stateid /* request */; u32 oc_seqid /* request */; stateid_t oc_resp_stateid /* response */; - struct nfs4_stateowner * oc_stateowner; /* response */ }; struct nfsd4_open_downgrade { @@ -251,7 +247,6 @@ struct nfsd4_open_downgrade { u32 od_seqid; u32 od_share_access; u32 od_share_deny; - struct nfs4_stateowner *od_stateowner; }; @@ -325,8 +320,7 @@ struct nfsd4_setattr { struct nfsd4_setclientid { nfs4_verifier se_verf; /* request */ - u32 se_namelen; /* request */ - char * se_name; /* request */ + struct xdr_netobj se_name; u32 se_callback_prog; /* request */ u32 se_callback_netid_len; /* request */ char * se_callback_netid_val; /* request */ @@ -342,6 +336,24 @@ struct nfsd4_setclientid_confirm { nfs4_verifier sc_confirm; }; +struct nfsd4_saved_compoundargs { + __be32 *p; + __be32 *end; + int pagelen; + struct page **pagelist; +}; + +struct nfsd4_test_stateid { + __be32 ts_num_ids; + struct nfsd4_compoundargs *ts_saved_args; + struct nfsd4_saved_compoundargs ts_savedp; +}; + +struct nfsd4_free_stateid { + stateid_t fr_stateid; /* request */ + __be32 fr_status; /* response */ +}; + /* also used for NVERIFY */ struct nfsd4_verify { u32 ve_bmval[3]; /* request */ @@ -386,6 +398,10 @@ struct nfsd4_destroy_session { struct nfs4_sessionid sessionid; }; +struct nfsd4_destroy_clientid { + clientid_t clientid; +}; + struct nfsd4_reclaim_complete { u32 rca_one_fs; }; @@ -432,10 +448,14 @@ struct nfsd4_op { struct nfsd4_destroy_session destroy_session; struct nfsd4_sequence sequence; struct nfsd4_reclaim_complete reclaim_complete; + struct nfsd4_test_stateid test_stateid; + struct nfsd4_free_stateid free_stateid; } u; struct nfs4_replay * replay; }; +bool nfsd4_cache_this_op(struct nfsd4_op *); + struct nfsd4_compoundargs { /* scratch variables for XDR decode */ __be32 * p; @@ -458,6 +478,7 @@ struct nfsd4_compoundargs { u32 opcnt; struct nfsd4_op *ops; struct nfsd4_op iops[8]; + int cachetype; }; struct nfsd4_compoundres { @@ -508,6 +529,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, struct nfsd4_compoundargs *); int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, struct nfsd4_compoundres *); +int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, @@ -534,11 +556,13 @@ extern __be32 nfsd4_sequence(struct svc_rqst *, extern __be32 nfsd4_destroy_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_session *); +extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *); __be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, struct nfsd4_open *open); extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open); +extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status); extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); extern __be32 nfsd4_close(struct svc_rqst *rqstp, @@ -559,11 +583,15 @@ extern __be32 nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_release_lockowner *rlockowner); -extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *); +extern int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp); extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr); extern __be32 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *, clientid_t *clid); +extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_test_stateid *test_stateid); +extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid); #endif /* diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 666628b395f1..b50ffb72e5b3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -354,7 +354,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode) failed_acl: failed_bmap: - inode->i_nlink = 0; + clear_nlink(inode); iput(inode); /* raw_inode will be deleted through generic_delete_inode() */ goto failed; @@ -396,7 +396,7 @@ int nilfs_read_inode_common(struct inode *inode, inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le64_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index a3141990061e..768982de10e4 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -289,7 +289,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) nilfs_warning(inode->i_sb, __func__, "deleting nonexistent file (%lu), %d\n", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } err = nilfs_delete_entry(de, page); if (err) diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 255d5e1c03b7..3777d138f895 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -276,10 +276,10 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* super.c */ extern struct inode *nilfs_alloc_inode(struct super_block *); extern void nilfs_destroy_inode(struct inode *); -extern void nilfs_error(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void nilfs_warning(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void nilfs_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void nilfs_warning(struct super_block *, const char *, const char *, ...); extern struct nilfs_super_block * nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); extern int nilfs_store_magic_and_option(struct super_block *, diff --git a/fs/notify/group.c b/fs/notify/group.c index d309f38449cb..63fc294a4692 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -26,7 +26,7 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" -#include <asm/atomic.h> +#include <linux/atomic.h> /* * Final freeing of a group diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 07ea8d3e6ea2..b13c00ac48eb 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -23,7 +23,7 @@ #include <linux/mutex.h> #include <linux/spinlock.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 252ab1f6452b..e14587d55689 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -92,7 +92,7 @@ #include <linux/spinlock.h> #include <linux/srcu.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" diff --git a/fs/notify/notification.c b/fs/notify/notification.c index f39260f8f865..ee188158a224 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -43,7 +43,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index e86577d6c5c3..778fe6cae3b0 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -24,7 +24,7 @@ #include <linux/mutex.h> #include <linux/spinlock.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h index 2142b1c68b61..53c27eaf2307 100644 --- a/fs/ntfs/debug.h +++ b/fs/ntfs/debug.h @@ -30,8 +30,9 @@ extern int debug_msgs; -extern void __ntfs_debug(const char *file, int line, const char *function, - const char *format, ...) __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ntfs_debug(const char *file, int line, const char *function, + const char *format, ...); /** * ntfs_debug - write a debug level message to syslog * @f: a printf format string containing the message @@ -52,12 +53,14 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl); #endif /* !DEBUG */ -extern void __ntfs_warning(const char *function, const struct super_block *sb, - const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void __ntfs_warning(const char *function, const struct super_block *sb, + const char *fmt, ...); #define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) -extern void __ntfs_error(const char *function, const struct super_block *sb, - const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void __ntfs_error(const char *function, const struct super_block *sb, + const char *fmt, ...); #define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) #endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 1371487da955..97e2dacbc867 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -612,7 +612,7 @@ static int ntfs_read_locked_inode(struct inode *vi) * might be tricky due to vfs interactions. Need to think about this * some more when implementing the unlink command. */ - vi->i_nlink = le16_to_cpu(m->link_count); + set_nlink(vi, le16_to_cpu(m->link_count)); /* * FIXME: Reparse points can have the directory bit set even though * they would be S_IFLNK. Need to deal with this further below when we @@ -634,7 +634,7 @@ static int ntfs_read_locked_inode(struct inode *vi) vi->i_mode &= ~vol->dmask; /* Things break without this kludge! */ if (vi->i_nlink > 1) - vi->i_nlink = 1; + set_nlink(vi, 1); } else { vi->i_mode |= S_IFREG; /* Apply the file permissions mask set in the mount options. */ @@ -1242,7 +1242,7 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) vi->i_version = base_vi->i_version; vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; - vi->i_nlink = base_vi->i_nlink; + set_nlink(vi, base_vi->i_nlink); vi->i_mtime = base_vi->i_mtime; vi->i_ctime = base_vi->i_ctime; vi->i_atime = base_vi->i_atime; @@ -1508,7 +1508,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) vi->i_version = base_vi->i_version; vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; - vi->i_nlink = base_vi->i_nlink; + set_nlink(vi, base_vi->i_nlink); vi->i_mtime = base_vi->i_mtime; vi->i_ctime = base_vi->i_ctime; vi->i_atime = base_vi->i_atime; diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 2dabf813456c..fe8e7e928889 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -24,7 +24,7 @@ #ifndef _LINUX_NTFS_INODE_H #define _LINUX_NTFS_INODE_H -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fs.h> #include <linux/list.h> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 1cee970eb55a..a7219075b4de 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -247,7 +247,7 @@ static int ocfs2_set_acl(handle_t *handle, case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; ret = posix_acl_equiv_mode(acl, &mode); if (ret < 0) return ret; @@ -290,47 +290,32 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_check_acl(struct inode *inode, int mask) +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) { struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; int ret = -EAGAIN; - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return ret; + return NULL; ret = ocfs2_read_inode_block(inode, &di_bh); - if (ret < 0) { - mlog_errno(ret); - return ret; - } + if (ret < 0) + return ERR_PTR(ret); - acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh); + acl = ocfs2_get_acl_nolock(inode, type, di_bh); brelse(di_bh); - if (IS_ERR(acl)) { - mlog_errno(PTR_ERR(acl)); - return PTR_ERR(acl); - } - if (acl) { - ret = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return ret; - } - - return -EAGAIN; + return acl; } int ocfs2_acl_chmod(struct inode *inode) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl, *clone; + struct posix_acl *acl; int ret; if (S_ISLNK(inode->i_mode)) @@ -342,15 +327,12 @@ int ocfs2_acl_chmod(struct inode *inode) acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); + ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (ret) + return ret; + ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, + acl, NULL, NULL); posix_acl_release(acl); - if (!clone) - return -ENOMEM; - ret = posix_acl_chmod_masq(clone, inode->i_mode); - if (!ret) - ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, - clone, NULL, NULL); - posix_acl_release(clone); return ret; } @@ -369,7 +351,7 @@ int ocfs2_init_acl(handle_t *handle, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct posix_acl *acl = NULL; int ret = 0, ret2; - mode_t mode; + umode_t mode; if (!S_ISLNK(inode->i_mode)) { if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { @@ -388,8 +370,6 @@ int ocfs2_init_acl(handle_t *handle, } } if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { - struct posix_acl *clone; - if (S_ISDIR(inode->i_mode)) { ret = ocfs2_set_acl(handle, inode, di_bh, ACL_TYPE_DEFAULT, acl, @@ -397,27 +377,22 @@ int ocfs2_init_acl(handle_t *handle, if (ret) goto cleanup; } - clone = posix_acl_clone(acl, GFP_NOFS); - ret = -ENOMEM; - if (!clone) - goto cleanup; - mode = inode->i_mode; - ret = posix_acl_create_masq(clone, &mode); - if (ret >= 0) { - ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); - if (ret2) { - mlog_errno(ret2); - ret = ret2; - goto cleanup; - } - if (ret > 0) { - ret = ocfs2_set_acl(handle, inode, - di_bh, ACL_TYPE_ACCESS, - clone, meta_ac, data_ac); - } + ret = posix_acl_create(&acl, GFP_NOFS, &mode); + if (ret < 0) + return ret; + + ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret2) { + mlog_errno(ret2); + ret = ret2; + goto cleanup; + } + if (ret > 0) { + ret = ocfs2_set_acl(handle, inode, + di_bh, ACL_TYPE_ACCESS, + acl, meta_ac, data_ac); } - posix_acl_release(clone); } cleanup: posix_acl_release(acl); diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 5c5d31f05853..071fbd380f2f 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -26,7 +26,7 @@ struct ocfs2_acl_entry { __le32 e_id; }; -extern int ocfs2_check_acl(struct inode *, int); +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); extern int ocfs2_acl_chmod(struct inode *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *, diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index ae13d5ca7908..044e7b58d31c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -59,6 +59,7 @@ #include <linux/idr.h> #include <linux/kref.h> #include <linux/net.h> +#include <linux/export.h> #include <net/tcp.h> #include <asm/uaccess.h> diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 3302088e1f04..8fe4e2892ab9 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2291,7 +2291,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, di_bh); i_size_write(inode, size); - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_blocks = ocfs2_inode_sector_count(inode); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); @@ -2353,7 +2353,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, new_bh); i_size_write(inode, inode->i_sb->s_blocksize); - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_blocks = ocfs2_inode_sector_count(inode); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 56f82cb912e3..0e28e242226d 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -30,6 +30,7 @@ #include <linux/sysctl.h> #include <linux/spinlock.h> #include <linux/debugfs.h> +#include <linux/export.h> #include "cluster/heartbeat.h" #include "cluster/nodemanager.h" diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index da103f555a62..81a4cd22f80b 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2098,7 +2098,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) inode->i_uid = be32_to_cpu(lvb->lvb_iuid); inode->i_gid = be32_to_cpu(lvb->lvb_igid); inode->i_mode = be16_to_cpu(lvb->lvb_imode); - inode->i_nlink = be16_to_cpu(lvb->lvb_inlink); + set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); ocfs2_unpack_timespec(&inode->i_atime, be64_to_cpu(lvb->lvb_iatime_packed)); ocfs2_unpack_timespec(&inode->i_mtime, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 09e3de57cdee..6e396683c3d4 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2692,14 +2692,14 @@ const struct inode_operations ocfs2_file_iops = { .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, - .check_acl = ocfs2_check_acl, + .get_acl = ocfs2_iop_get_acl, }; const struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, - .check_acl = ocfs2_check_acl, + .get_acl = ocfs2_iop_get_acl, }; /* diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index c4bf6ac4a0bb..17454a904d7b 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -291,7 +291,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)le64_to_cpu(fe->i_blkno)); - inode->i_nlink = ocfs2_read_links_count(fe); + set_nlink(inode, ocfs2_read_links_count(fe)); trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, le32_to_cpu(fe->i_flags)); @@ -1290,7 +1290,7 @@ void ocfs2_refresh_inode(struct inode *inode, OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); ocfs2_set_inode_flags(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); - inode->i_nlink = ocfs2_read_links_count(fe); + set_nlink(inode, ocfs2_read_links_count(fe)); inode->i_uid = le32_to_cpu(fe->i_uid); inode->i_gid = le32_to_cpu(fe->i_gid); inode->i_mode = le16_to_cpu(fe->i_mode); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index d3433d60dbb9..184c76b8c293 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -36,7 +36,6 @@ #include "dir.h" #include "buffer_head_io.h" #include "sysfile.h" -#include "suballoc.h" #include "refcounttree.h" #include "move_extents.h" diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 33889dc52dd7..a8b2bfea574e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -199,9 +199,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) * these are used by the support functions here and in * callers. */ if (S_ISDIR(mode)) - inode->i_nlink = 2; - else - inode->i_nlink = 1; + set_nlink(inode, 2); inode_init_owner(inode, dir, mode); dquot_initialize(inode); return inode; @@ -1379,7 +1377,7 @@ static int ocfs2_rename(struct inode *old_dir, } if (new_inode) { - new_inode->i_nlink--; + drop_nlink(new_inode); new_inode->i_ctime = CURRENT_TIME; } old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; @@ -1387,9 +1385,9 @@ static int ocfs2_rename(struct inode *old_dir, if (update_dot_dot) { status = ocfs2_update_entry(old_inode, handle, &old_inode_dot_dot_res, new_dir); - old_dir->i_nlink--; + drop_nlink(old_dir); if (new_inode) { - new_inode->i_nlink--; + drop_nlink(new_inode); } else { inc_nlink(new_dir); mark_inode_dirty(new_dir); @@ -2018,7 +2016,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) ocfs2_add_links_count(orphan_fe, 1); - orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); ocfs2_journal_dirty(handle, orphan_dir_bh); status = __ocfs2_add_entry(handle, orphan_dir_inode, name, @@ -2116,7 +2114,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) ocfs2_add_links_count(orphan_fe, -1); - orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); ocfs2_journal_dirty(handle, orphan_dir_bh); leave: @@ -2282,7 +2280,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - inode->i_nlink = 0; + clear_nlink(inode); /* do the real work now. */ status = __ocfs2_mknod_locked(dir, inode, 0, &new_di_bh, parent_di_bh, handle, @@ -2437,7 +2435,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, di = (struct ocfs2_dinode *)di_bh->b_data; le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); di->i_orphaned_slot = 0; - inode->i_nlink = 1; + set_nlink(inode, 1); ocfs2_set_links_count(di, inode->i_nlink); ocfs2_journal_dirty(handle, di_bh); @@ -2498,5 +2496,5 @@ const struct inode_operations ocfs2_dir_iops = { .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, - .check_acl = ocfs2_check_acl, + .get_acl = ocfs2_iop_get_acl, }; diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 40c7de084c10..74ff74cf78fe 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -31,17 +31,15 @@ extern struct workqueue_struct *ocfs2_wq; int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, int node_num); -void __ocfs2_error(struct super_block *sb, - const char *function, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); +__printf(3, 4) +void __ocfs2_error(struct super_block *sb, const char *function, + const char *fmt, ...); #define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) -void __ocfs2_abort(struct super_block *sb, - const char *function, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); +__printf(3, 4) +void __ocfs2_abort(struct super_block *sb, const char *function, + const char *fmt, ...); #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 93d6c80b3922..aa9e8777b09a 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7187,20 +7187,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, { int ret = 0; struct buffer_head *dir_bh = NULL; - struct ocfs2_security_xattr_info si = { - .enable = 1, - }; - ret = ocfs2_init_security_get(inode, dir, qstr, &si); + ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (!ret) { - ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, - si.name, si.value, si.value_len, - XATTR_CREATE); - if (ret) { - mlog_errno(ret); - goto leave; - } - } else if (ret != -EOPNOTSUPP) { mlog_errno(ret); goto leave; } @@ -7257,6 +7246,22 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } +int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, XATTR_CREATE); + if (err) + break; + } + return err; +} + int ocfs2_init_security_get(struct inode *inode, struct inode *dir, const struct qstr *qstr, @@ -7265,8 +7270,13 @@ int ocfs2_init_security_get(struct inode *inode, /* check whether ocfs2 support feature xattr */ if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) return -EOPNOTSUPP; - return security_inode_init_security(inode, dir, qstr, &si->name, - &si->value, &si->value_len); + if (si) + return security_old_inode_init_security(inode, dir, qstr, + &si->name, &si->value, + &si->value_len); + + return security_inode_init_security(inode, dir, qstr, + &ocfs2_initxattrs, NULL); } int ocfs2_init_security_set(handle_t *handle, diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 3b8d3979e03b..98e544274390 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -93,7 +93,7 @@ int omfs_make_empty(struct inode *inode, struct super_block *sb) memset(bh->b_data, 0, sizeof(struct omfs_inode)); - if (inode->i_mode & S_IFDIR) { + if (S_ISDIR(inode->i_mode)) { memset(&bh->b_data[OMFS_DIR_START], 0xff, sbi->s_sys_blocksize - OMFS_DIR_START); } else diff --git a/fs/open.c b/fs/open.c index 739b751aa73e..22c41b543f2d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -446,74 +446,52 @@ out: return error; } -SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) +static int chmod_common(struct path *path, umode_t mode) { - struct inode * inode; - struct dentry * dentry; - struct file * file; - int err = -EBADF; + struct inode *inode = path->dentry->d_inode; struct iattr newattrs; + int error; - file = fget(fd); - if (!file) - goto out; - - dentry = file->f_path.dentry; - inode = dentry->d_inode; - - audit_inode(NULL, dentry); - - err = mnt_want_write_file(file); - if (err) - goto out_putf; + error = mnt_want_write(path->mnt); + if (error) + return error; mutex_lock(&inode->i_mutex); - err = security_path_chmod(dentry, file->f_vfsmnt, mode); - if (err) + error = security_path_chmod(path->dentry, path->mnt, mode); + if (error) goto out_unlock; - if (mode == (mode_t) -1) - mode = inode->i_mode; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - err = notify_change(dentry, &newattrs); + error = notify_change(path->dentry, &newattrs); out_unlock: mutex_unlock(&inode->i_mutex); - mnt_drop_write(file->f_path.mnt); -out_putf: - fput(file); -out: + mnt_drop_write(path->mnt); + return error; +} + +SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) +{ + struct file * file; + int err = -EBADF; + + file = fget(fd); + if (file) { + audit_inode(NULL, file->f_path.dentry); + err = chmod_common(&file->f_path, mode); + fput(file); + } return err; } SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) { struct path path; - struct inode *inode; int error; - struct iattr newattrs; error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); - if (error) - goto out; - inode = path.dentry->d_inode; - - error = mnt_want_write(path.mnt); - if (error) - goto dput_and_out; - mutex_lock(&inode->i_mutex); - error = security_path_chmod(path.dentry, path.mnt, mode); - if (error) - goto out_unlock; - if (mode == (mode_t) -1) - mode = inode->i_mode; - newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(path.dentry, &newattrs); -out_unlock: - mutex_unlock(&inode->i_mutex); - mnt_drop_write(path.mnt); -dput_and_out: - path_put(&path); -out: + if (!error) { + error = chmod_common(&path, mode); + path_put(&path); + } return error; } @@ -707,6 +685,10 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, if (error) goto cleanup_all; + error = break_lease(inode, f->f_flags); + if (error) + goto cleanup_all; + if (!open && f->f_op) open = f->f_op->open; if (open) { diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index a2a5bff774e3..e4e0ff7962e2 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -242,7 +242,7 @@ found: inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_op = &openprom_inode_operations; inode->i_fop = &openprom_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); break; case op_inode_prop: if (!strcmp(dp->name, "options") && (len == 17) && @@ -251,7 +251,7 @@ found: else inode->i_mode = S_IFREG | S_IRUGO; inode->i_fop = &openpromfs_prop_ops; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_size = ent_oi->u.prop->length; break; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index d545e97d99c3..e3c63d1c5e13 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -237,22 +237,22 @@ ssize_t part_size_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } -ssize_t part_ro_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t part_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%d\n", p->policy ? 1 : 0); } -ssize_t part_alignment_offset_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t part_alignment_offset_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); } -ssize_t part_discard_alignment_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t part_discard_alignment_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%u\n", p->discard_alignment); diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index af9fdf046769..bd8ae788f689 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -49,18 +49,20 @@ #define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) #define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) -__attribute__ ((format (printf, 3, 4))) -static void _ldm_printk (const char *level, const char *function, - const char *fmt, ...) +static __printf(3, 4) +void _ldm_printk(const char *level, const char *function, const char *fmt, ...) { - static char buf[128]; + struct va_format vaf; va_list args; va_start (args, fmt); - vsnprintf (buf, sizeof (buf), fmt, args); - va_end (args); - printk ("%s%s(): %s\n", level, function, buf); + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s%s(): %pV\n", level, function, &vaf); + + va_end(args); } /** diff --git a/fs/pipe.c b/fs/pipe.c index da42f7db50de..4065f07366b3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -948,7 +948,7 @@ static const struct dentry_operations pipefs_dentry_operations = { static struct inode * get_pipe_inode(void) { - struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); struct pipe_inode_info *pipe; if (!inode) @@ -1254,6 +1254,7 @@ out: static const struct super_operations pipefs_ops = { .destroy_inode = free_inode_nonrcu, + .statfs = simple_statfs, }; /* @@ -1291,8 +1292,8 @@ static int __init init_pipe_fs(void) static void __exit exit_pipe_fs(void) { + kern_unmount(pipe_mnt); unregister_filesystem(&pipe_fs_type); - mntput(pipe_mnt); } fs_initcall(init_pipe_fs); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index b1cf6bf4b41d..cea4623f1ed6 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -14,7 +14,7 @@ #include <linux/kernel.h> #include <linux/slab.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/posix_acl.h> @@ -24,13 +24,9 @@ EXPORT_SYMBOL(posix_acl_init); EXPORT_SYMBOL(posix_acl_alloc); -EXPORT_SYMBOL(posix_acl_clone); EXPORT_SYMBOL(posix_acl_valid); EXPORT_SYMBOL(posix_acl_equiv_mode); EXPORT_SYMBOL(posix_acl_from_mode); -EXPORT_SYMBOL(posix_acl_create_masq); -EXPORT_SYMBOL(posix_acl_chmod_masq); -EXPORT_SYMBOL(posix_acl_permission); /* * Init a fresh posix_acl @@ -59,7 +55,7 @@ posix_acl_alloc(int count, gfp_t flags) /* * Clone an ACL. */ -struct posix_acl * +static struct posix_acl * posix_acl_clone(const struct posix_acl *acl, gfp_t flags) { struct posix_acl *clone = NULL; @@ -153,10 +149,10 @@ posix_acl_valid(const struct posix_acl *acl) * file mode permission bits, or else 1. Returns -E... on error. */ int -posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p) +posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) { const struct posix_acl_entry *pa, *pe; - mode_t mode = 0; + umode_t mode = 0; int not_equiv = 0; FOREACH_ACL_ENTRY(pa, acl, pe) { @@ -192,7 +188,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p) * Create an ACL representing the file mode permission bits of an inode. */ struct posix_acl * -posix_acl_from_mode(mode_t mode, gfp_t flags) +posix_acl_from_mode(umode_t mode, gfp_t flags) { struct posix_acl *acl = posix_acl_alloc(3, flags); if (!acl) @@ -222,6 +218,8 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; + want &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; + FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: @@ -283,12 +281,11 @@ check_perm: * system calls. All permissions that are not granted by the acl are removed. * The permissions in the acl are changed to reflect the mode_p parameter. */ -int -posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) +static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) { struct posix_acl_entry *pa, *pe; struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; - mode_t mode = *mode_p; + umode_t mode = *mode_p; int not_equiv = 0; /* assert(atomic_read(acl->a_refcount) == 1); */ @@ -341,8 +338,7 @@ posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) /* * Modify the ACL for the chmod syscall. */ -int -posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) +static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) { struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; struct posix_acl_entry *pa, *pe; @@ -386,3 +382,39 @@ posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) return 0; } + +int +posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) +{ + struct posix_acl *clone = posix_acl_clone(*acl, gfp); + int err = -ENOMEM; + if (clone) { + err = posix_acl_create_masq(clone, mode_p); + if (err < 0) { + posix_acl_release(clone); + clone = NULL; + } + } + posix_acl_release(*acl); + *acl = clone; + return err; +} +EXPORT_SYMBOL(posix_acl_create); + +int +posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) +{ + struct posix_acl *clone = posix_acl_clone(*acl, gfp); + int err = -ENOMEM; + if (clone) { + err = posix_acl_chmod_masq(clone, mode); + if (err) { + posix_acl_release(clone); + clone = NULL; + } + } + posix_acl_release(*acl); + *acl = clone; + return err; +} +EXPORT_SYMBOL(posix_acl_chmod); diff --git a/fs/proc/base.c b/fs/proc/base.c index 91fb655a5cbf..851ba3dcdc29 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1107,21 +1107,13 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, goto err_sighand; } - if (oom_adjust != task->signal->oom_adj) { - if (oom_adjust == OOM_DISABLE) - atomic_inc(&task->mm->oom_disable_count); - if (task->signal->oom_adj == OOM_DISABLE) - atomic_dec(&task->mm->oom_disable_count); - } - /* * Warn that /proc/pid/oom_adj is deprecated, see * Documentation/feature-removal-schedule.txt. */ - printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, " - "please use /proc/%d/oom_score_adj instead.\n", - current->comm, task_pid_nr(current), - task_pid_nr(task), task_pid_nr(task)); + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); task->signal->oom_adj = oom_adjust; /* * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum @@ -1216,12 +1208,6 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_sighand; } - if (oom_score_adj != task->signal->oom_score_adj) { - if (oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_inc(&task->mm->oom_disable_count); - if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&task->mm->oom_disable_count); - } task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; @@ -1920,6 +1906,14 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (file) { + unsigned int f_flags; + struct fdtable *fdt; + + fdt = files_fdtable(files); + f_flags = file->f_flags & ~O_CLOEXEC; + if (FD_ISSET(fd, fdt->close_on_exec)) + f_flags |= O_CLOEXEC; + if (path) { *path = file->f_path; path_get(&file->f_path); @@ -1929,7 +1923,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) "pos:\t%lli\n" "flags:\t0%o\n", (long long) file->f_pos, - file->f_flags); + f_flags); spin_unlock(&files->file_lock); put_files_struct(files); return 0; @@ -2254,7 +2248,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir, ei = PROC_I(inode); inode->i_mode = p->mode; if (S_ISDIR(inode->i_mode)) - inode->i_nlink = 2; /* Use getattr to fix if necessary */ + set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop) inode->i_op = p->iop; if (p->fop) @@ -2648,7 +2642,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir, inode->i_mode = p->mode; if (S_ISDIR(inode->i_mode)) - inode->i_nlink = 2; + set_nlink(inode, 2); if (S_ISLNK(inode->i_mode)) inode->i_size = 64; if (p->iop) @@ -2707,9 +2701,16 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { struct task_io_accounting acct = task->ioac; unsigned long flags; + int result; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - return -EACCES; + result = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (result) + return result; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + result = -EACCES; + goto out_unlock; + } if (whole && lock_task_sighand(task, &flags)) { struct task_struct *t = task; @@ -2720,7 +2721,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) unlock_task_sighand(task, &flags); } - return sprintf(buffer, + result = sprintf(buffer, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" @@ -2735,6 +2736,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) (unsigned long long)acct.read_bytes, (unsigned long long)acct.write_bytes, (unsigned long long)acct.cancelled_write_bytes); +out_unlock: + mutex_unlock(&task->signal->cred_guard_mutex); + return result; } static int proc_tid_io_accounting(struct task_struct *task, char *buffer) @@ -2977,8 +2981,8 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, - ARRAY_SIZE(tgid_base_stuff)); + set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, + ARRAY_SIZE(tgid_base_stuff))); d_set_d_op(dentry, &pid_dentry_operations); @@ -3229,8 +3233,8 @@ static struct dentry *proc_task_instantiate(struct inode *dir, inode->i_fop = &proc_tid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, - ARRAY_SIZE(tid_base_stuff)); + set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, + ARRAY_SIZE(tid_base_stuff))); d_set_d_op(dentry, &pid_dentry_operations); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index f1637f17c37c..10090d9c7ad5 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -283,7 +283,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode = dentry->d_inode; struct proc_dir_entry *de = PROC_I(inode)->pde; if (de && de->nlink) - inode->i_nlink = de->nlink; + set_nlink(inode, de->nlink); generic_fillattr(inode, stat); return 0; @@ -620,8 +620,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, if (!ent) goto out; memset(ent, 0, sizeof(struct proc_dir_entry)); - memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); - ent->name = ((char *) ent) + sizeof(*ent); + memcpy(ent->name, fn, len + 1); ent->namelen = len; ent->mode = mode; ent->nlink = nlink; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 74b48cfa1bb2..7737c5468a40 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -319,7 +319,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (!pde->proc_fops) { spin_unlock(&pde->pde_unload_lock); kfree(pdeo); - return -EINVAL; + return -ENOENT; } pde->pde_users++; open = pde->proc_fops->open; @@ -445,7 +445,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) if (de->size) inode->i_size = de->size; if (de->nlink) - inode->i_nlink = de->nlink; + set_nlink(inode, de->nlink); if (de->proc_iops) inode->i_op = de->proc_iops; if (de->proc_fops) { diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index ed257d141568..586174168e2a 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -10,7 +10,7 @@ #include <linux/seq_file.h> #include <linux/swap.h> #include <linux/vmstat.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/page.h> #include <asm/pgtable.h> #include "internal.h" diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 9020ac15baaa..f738024ccc8e 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -197,15 +197,15 @@ static __net_init int proc_net_ns_init(struct net *net) int err; err = -ENOMEM; - netd = kzalloc(sizeof(*netd), GFP_KERNEL); + netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL); if (!netd) goto out; netd->data = net; netd->nlink = 2; - netd->name = "net"; netd->namelen = 3; netd->parent = &proc_root; + memcpy(netd->name, "net", 4); err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 1a77dbef226f..a6b62173d4c3 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -3,6 +3,7 @@ */ #include <linux/init.h> #include <linux/sysctl.h> +#include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/namei.h> @@ -14,6 +15,15 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; +void proc_sys_poll_notify(struct ctl_table_poll *poll) +{ + if (!poll) + return; + + atomic_inc(&poll->event); + wake_up_interruptible(&poll->wait); +} + static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { @@ -39,7 +49,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_fop = &proc_sys_file_operations; } else { inode->i_mode |= S_IFDIR; - inode->i_nlink = 0; + clear_nlink(inode); inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; } @@ -176,6 +186,39 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf, return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); } +static int proc_sys_open(struct inode *inode, struct file *filp) +{ + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + if (table->poll) + filp->private_data = proc_sys_poll_event(table->poll); + + return 0; +} + +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + unsigned long event = (unsigned long)filp->private_data; + unsigned int ret = DEFAULT_POLLMASK; + + if (!table->proc_handler) + goto out; + + if (!table->poll) + goto out; + + poll_wait(filp, &table->poll->wait, wait); + + if (event != atomic_read(&table->poll->event)) { + filp->private_data = proc_sys_poll_event(table->poll); + ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + +out: + return ret; +} static int proc_sys_fill_cache(struct file *filp, void *dirent, filldir_t filldir, @@ -364,12 +407,15 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct } static const struct file_operations proc_sys_file_operations = { + .open = proc_sys_open, + .poll = proc_sys_poll, .read = proc_sys_read, .write = proc_sys_write, .llseek = default_llseek, }; static const struct file_operations proc_sys_dir_file_operations = { + .read = generic_read_dir, .readdir = proc_sys_readdir, .llseek = generic_file_llseek, }; diff --git a/fs/proc/root.c b/fs/proc/root.c index d6c3b416529b..9a8a2b77b874 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -186,13 +186,13 @@ static const struct inode_operations proc_root_inode_operations = { struct proc_dir_entry proc_root = { .low_ino = PROC_ROOT_INO, .namelen = 5, - .name = "/proc", .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, .count = ATOMIC_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, + .name = "/proc", }; int pid_ns_prepare_proc(struct pid_namespace *ns) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 9758b654a1bc..42b274da92c3 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -10,6 +10,7 @@ #include <linux/time.h> #include <linux/irqnr.h> #include <asm/cputime.h> +#include <linux/tick.h> #ifndef arch_irq_stat_cpu #define arch_irq_stat_cpu(cpu) 0 @@ -21,6 +22,35 @@ #define arch_idle_time(cpu) 0 #endif +static cputime64_t get_idle_time(int cpu) +{ + u64 idle_time = get_cpu_idle_time_us(cpu, NULL); + cputime64_t idle; + + if (idle_time == -1ULL) { + /* !NO_HZ so we can rely on cpustat.idle */ + idle = kstat_cpu(cpu).cpustat.idle; + idle = cputime64_add(idle, arch_idle_time(cpu)); + } else + idle = usecs_to_cputime(idle_time); + + return idle; +} + +static cputime64_t get_iowait_time(int cpu) +{ + u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); + cputime64_t iowait; + + if (iowait_time == -1ULL) + /* !NO_HZ so we can rely on cpustat.iowait */ + iowait = kstat_cpu(cpu).cpustat.iowait; + else + iowait = usecs_to_cputime(iowait_time); + + return iowait; +} + static int show_stat(struct seq_file *p, void *v) { int i, j; @@ -42,9 +72,8 @@ static int show_stat(struct seq_file *p, void *v) user = cputime64_add(user, kstat_cpu(i).cpustat.user); nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); system = cputime64_add(system, kstat_cpu(i).cpustat.system); - idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); + idle = cputime64_add(idle, get_idle_time(i)); + iowait = cputime64_add(iowait, get_iowait_time(i)); irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); @@ -76,14 +105,12 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(guest), (unsigned long long)cputime64_to_clock_t(guest_nice)); for_each_online_cpu(i) { - /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ user = kstat_cpu(i).cpustat.user; nice = kstat_cpu(i).cpustat.nice; system = kstat_cpu(i).cpustat.system; - idle = kstat_cpu(i).cpustat.idle; - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = kstat_cpu(i).cpustat.iowait; + idle = get_idle_time(i); + iowait = get_iowait_time(i); irq = kstat_cpu(i).cpustat.irq; softirq = kstat_cpu(i).cpustat.softirq; steal = kstat_cpu(i).cpustat.steal; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 25b6a887adb9..e418c5abdb0e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -44,6 +44,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" + "VmPin:\t%8lu kB\n" "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" "VmData:\t%8lu kB\n" @@ -55,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) hiwater_vm << (PAGE_SHIFT-10), (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), + mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), @@ -877,30 +879,54 @@ struct numa_maps_private { struct numa_maps md; }; -static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty) +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, + unsigned long nr_pages) { int count = page_mapcount(page); - md->pages++; + md->pages += nr_pages; if (pte_dirty || PageDirty(page)) - md->dirty++; + md->dirty += nr_pages; if (PageSwapCache(page)) - md->swapcache++; + md->swapcache += nr_pages; if (PageActive(page) || PageUnevictable(page)) - md->active++; + md->active += nr_pages; if (PageWriteback(page)) - md->writeback++; + md->writeback += nr_pages; if (PageAnon(page)) - md->anon++; + md->anon += nr_pages; if (count > md->mapcount_max) md->mapcount_max = count; - md->node[page_to_nid(page)]++; + md->node[page_to_nid(page)] += nr_pages; +} + +static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pte_present(pte)) + return NULL; + + page = vm_normal_page(vma, addr, pte); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_HIGH_MEMORY])) + return NULL; + + return page; } static int gather_pte_stats(pmd_t *pmd, unsigned long addr, @@ -912,26 +938,32 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *pte; md = walk->private; - orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - do { - struct page *page; - int nid; + spin_lock(&walk->mm->page_table_lock); + if (pmd_trans_huge(*pmd)) { + if (pmd_trans_splitting(*pmd)) { + spin_unlock(&walk->mm->page_table_lock); + wait_split_huge_page(md->vma->anon_vma, pmd); + } else { + pte_t huge_pte = *(pte_t *)pmd; + struct page *page; - if (!pte_present(*pte)) - continue; + page = can_gather_numa_stats(huge_pte, md->vma, addr); + if (page) + gather_stats(page, md, pte_dirty(huge_pte), + HPAGE_PMD_SIZE/PAGE_SIZE); + spin_unlock(&walk->mm->page_table_lock); + return 0; + } + } else { + spin_unlock(&walk->mm->page_table_lock); + } - page = vm_normal_page(md->vma, addr, *pte); + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + do { + struct page *page = can_gather_numa_stats(*pte, md->vma, addr); if (!page) continue; - - if (PageReserved(page)) - continue; - - nid = page_to_nid(page); - if (!node_isset(nid, node_states[N_HIGH_MEMORY])) - continue; - - gather_stats(page, md, pte_dirty(*pte)); + gather_stats(page, md, pte_dirty(*pte), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); @@ -952,7 +984,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, return 0; md = walk->private; - gather_stats(page, md, pte_dirty(*pte)); + gather_stats(page, md, pte_dirty(*pte), 1); return 0; } @@ -1009,6 +1041,9 @@ static int show_numa_map(struct seq_file *m, void *v) seq_printf(m, " stack"); } + if (is_vm_hugetlb_page(vma)) + seq_printf(m, " huge"); + walk_page_range(vma->vm_start, vma->vm_end, &walk); if (!md->pages) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index cd99bf557650..b0f450a2bb7c 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -12,6 +12,7 @@ #include <linux/user.h> #include <linux/elf.h> #include <linux/elfcore.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/highmem.h> #include <linux/bootmem.h> diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 977ed2723845..379a02dc1217 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -24,6 +24,7 @@ #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> +#include <linux/list.h> #include <linux/string.h> #include <linux/mount.h> #include <linux/ramfs.h> @@ -32,15 +33,21 @@ #include <linux/magic.h> #include <linux/pstore.h> #include <linux/slab.h> +#include <linux/spinlock.h> #include <linux/uaccess.h> #include "internal.h" #define PSTORE_NAMELEN 64 +static DEFINE_SPINLOCK(allpstore_lock); +static LIST_HEAD(allpstore); + struct pstore_private { + struct list_head list; + struct pstore_info *psi; + enum pstore_type_id type; u64 id; - int (*erase)(u64); ssize_t size; char data[]; }; @@ -73,15 +80,23 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = dentry->d_inode->i_private; - p->erase(p->id); + p->psi->erase(p->type, p->id, p->psi); return simple_unlink(dir, dentry); } static void pstore_evict_inode(struct inode *inode) { + struct pstore_private *p = inode->i_private; + unsigned long flags; + end_writeback(inode); - kfree(inode->i_private); + if (p) { + spin_lock_irqsave(&allpstore_lock, flags); + list_del(&p->list); + spin_unlock_irqrestore(&allpstore_lock, flags); + kfree(p); + } } static const struct inode_operations pstore_dir_inode_operations = { @@ -175,15 +190,29 @@ int pstore_is_mounted(void) * Set the mtime & ctime to the date that this record was originally stored. */ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, - char *data, size_t size, - struct timespec time, int (*erase)(u64)) + char *data, size_t size, struct timespec time, + struct pstore_info *psi) { struct dentry *root = pstore_sb->s_root; struct dentry *dentry; struct inode *inode; - int rc; + int rc = 0; char name[PSTORE_NAMELEN]; - struct pstore_private *private; + struct pstore_private *private, *pos; + unsigned long flags; + + spin_lock_irqsave(&allpstore_lock, flags); + list_for_each_entry(pos, &allpstore, list) { + if (pos->type == type && + pos->id == id && + pos->psi == psi) { + rc = -EEXIST; + break; + } + } + spin_unlock_irqrestore(&allpstore_lock, flags); + if (rc) + return rc; rc = -ENOMEM; inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0); @@ -192,8 +221,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, private = kmalloc(sizeof *private + size, GFP_KERNEL); if (!private) goto fail_alloc; + private->type = type; private->id = id; - private->erase = erase; + private->psi = psi; switch (type) { case PSTORE_TYPE_DMESG: @@ -227,6 +257,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, d_add(dentry, inode); + spin_lock_irqsave(&allpstore_lock, flags); + list_add(&private->list, &allpstore); + spin_unlock_irqrestore(&allpstore_lock, flags); + mutex_unlock(&root->d_inode->i_mutex); return 0; @@ -275,7 +309,7 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent) goto fail; } - pstore_get_records(); + pstore_get_records(0); return 0; fail: diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 8c9f23eb1645..3bde461c3f34 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -1,6 +1,6 @@ extern void pstore_set_kmsg_bytes(int); -extern void pstore_get_records(void); +extern void pstore_get_records(int); extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, char *data, size_t size, - struct timespec time, int (*erase)(u64)); + struct timespec time, struct pstore_info *psi); extern int pstore_is_mounted(void); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f2c3ff20ea68..57bbf9078ac8 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -25,18 +25,38 @@ #include <linux/module.h> #include <linux/pstore.h> #include <linux/string.h> +#include <linux/timer.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/hardirq.h> +#include <linux/workqueue.h> #include "internal.h" /* + * We defer making "oops" entries appear in pstore - see + * whether the system is actually still running well enough + * to let someone see the entry + */ +#define PSTORE_INTERVAL (60 * HZ) + +static int pstore_new_entry; + +static void pstore_timefunc(unsigned long); +static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0); + +static void pstore_dowork(struct work_struct *); +static DECLARE_WORK(pstore_work, pstore_dowork); + +/* * pstore_lock just protects "psinfo" during * calls to pstore_register() */ static DEFINE_SPINLOCK(pstore_lock); static struct pstore_info *psinfo; +static char *backend; + /* How much of the console log to snapshot */ static unsigned long kmsg_bytes = 10240; @@ -67,18 +87,26 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long size, total = 0; char *dst, *why; u64 id; - int hsize, part = 1; + int hsize, ret; + unsigned int part = 1; + unsigned long flags = 0; + int is_locked = 0; if (reason < ARRAY_SIZE(reason_str)) why = reason_str[reason]; else why = "Unknown"; - mutex_lock(&psinfo->buf_mutex); + if (in_nmi()) { + is_locked = spin_trylock(&psinfo->buf_lock); + if (!is_locked) + pr_err("pstore dump routine blocked in NMI, may corrupt error record\n"); + } else + spin_lock_irqsave(&psinfo->buf_lock, flags); oopscount++; while (total < kmsg_bytes) { dst = psinfo->buf; - hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++); + hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part); size = psinfo->bufsize - hsize; dst += hsize; @@ -94,16 +122,20 @@ static void pstore_dump(struct kmsg_dumper *dumper, memcpy(dst, s1 + s1_start, l1_cpy); memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); - id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy); - if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) - pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, - psinfo->buf, hsize + l1_cpy + l2_cpy, - CURRENT_TIME, psinfo->erase); + ret = psinfo->write(PSTORE_TYPE_DMESG, &id, part, + hsize + l1_cpy + l2_cpy, psinfo); + if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) + pstore_new_entry = 1; l1 -= l1_cpy; l2 -= l2_cpy; total += l1_cpy + l2_cpy; + part++; } - mutex_unlock(&psinfo->buf_mutex); + if (in_nmi()) { + if (is_locked) + spin_unlock(&psinfo->buf_lock); + } else + spin_unlock_irqrestore(&psinfo->buf_lock, flags); } static struct kmsg_dumper pstore_dumper = { @@ -128,7 +160,14 @@ int pstore_register(struct pstore_info *psi) spin_unlock(&pstore_lock); return -EBUSY; } + + if (backend && strcmp(backend, psi->name)) { + spin_unlock(&pstore_lock); + return -EINVAL; + } + psinfo = psi; + mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); if (owner && !try_module_get(owner)) { @@ -137,21 +176,27 @@ int pstore_register(struct pstore_info *psi) } if (pstore_is_mounted()) - pstore_get_records(); + pstore_get_records(0); kmsg_dump_register(&pstore_dumper); + pstore_timer.expires = jiffies + PSTORE_INTERVAL; + add_timer(&pstore_timer); + return 0; } EXPORT_SYMBOL_GPL(pstore_register); /* - * Read all the records from the persistent store. Create and - * file files in our filesystem. + * Read all the records from the persistent store. Create + * files in our filesystem. Don't warn about -EEXIST errors + * when we are re-scanning the backing store looking to add new + * error records. */ -void pstore_get_records(void) +void pstore_get_records(int quiet) { struct pstore_info *psi = psinfo; + char *buf = NULL; ssize_t size; u64 id; enum pstore_type_id type; @@ -161,32 +206,52 @@ void pstore_get_records(void) if (!psi) return; - mutex_lock(&psinfo->buf_mutex); + mutex_lock(&psi->read_mutex); rc = psi->open(psi); if (rc) goto out; - while ((size = psi->read(&id, &type, &time)) > 0) { - if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, - time, psi->erase)) + while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) { + rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size, + time, psi); + kfree(buf); + buf = NULL; + if (rc && (rc != -EEXIST || !quiet)) failed++; } psi->close(psi); out: - mutex_unlock(&psinfo->buf_mutex); + mutex_unlock(&psi->read_mutex); if (failed) printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", failed, psi->name); } +static void pstore_dowork(struct work_struct *work) +{ + pstore_get_records(1); +} + +static void pstore_timefunc(unsigned long dummy) +{ + if (pstore_new_entry) { + pstore_new_entry = 0; + schedule_work(&pstore_work); + } + + mod_timer(&pstore_timer, jiffies + PSTORE_INTERVAL); +} + /* * Call platform driver to write a record to the * persistent store. */ int pstore_write(enum pstore_type_id type, char *buf, size_t size) { - u64 id; + u64 id; + int ret; + unsigned long flags; if (!psinfo) return -ENODEV; @@ -194,14 +259,17 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size) if (size > psinfo->bufsize) return -EFBIG; - mutex_lock(&psinfo->buf_mutex); + spin_lock_irqsave(&psinfo->buf_lock, flags); memcpy(psinfo->buf, buf, size); - id = psinfo->write(type, size); - if (pstore_is_mounted()) + ret = psinfo->write(type, &id, 0, size, psinfo); + if (ret == 0 && pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, - size, CURRENT_TIME, psinfo->erase); - mutex_unlock(&psinfo->buf_mutex); + size, CURRENT_TIME, psinfo); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); return 0; } EXPORT_SYMBOL_GPL(pstore_write); + +module_param(backend, charp, 0444); +MODULE_PARM_DESC(backend, "Pstore backend to use"); diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 2b0646613f5a..3bdd21418432 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -379,7 +379,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->di_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->di_uid); inode->i_gid = (gid_t)le16_to_cpu(raw_inode->di_gid); - inode->i_nlink = le16_to_cpu(raw_inode->di_nlink); + set_nlink(inode, le16_to_cpu(raw_inode->di_nlink)); inode->i_size = le32_to_cpu(raw_inode->di_size); inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime); inode->i_mtime.tv_nsec = 0; diff --git a/fs/quota/quota.c b/fs/quota/quota.c index b34bdb25490c..35f4b0ecdeb3 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, /* caller already holds s_umount */ if (sb->s_flags & MS_RDONLY) return -EROFS; - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, WB_REASON_SYNC); return 0; default: return -EINVAL; @@ -355,7 +355,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, * resolution (think about autofs) and thus deadlocks could arise. */ if (cmds == Q_QUOTAON) { - ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path); + ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); if (ret) pathp = ERR_PTR(ret); else @@ -363,12 +363,15 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, } sb = quotactl_block(special); - if (IS_ERR(sb)) - return PTR_ERR(sb); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto out; + } ret = do_quotactl(sb, type, cmds, id, addr, pathp); drop_super(sb); +out: if (pathp && !IS_ERR(pathp)) path_put(pathp); return ret; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index eacb166fb259..462ceb38fec6 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -23,7 +23,6 @@ * caches is sufficient. */ -#include <linux/module.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> @@ -288,14 +287,7 @@ static int __init init_ramfs_fs(void) { return register_filesystem(&ramfs_fs_type); } - -static void __exit exit_ramfs_fs(void) -{ - unregister_filesystem(&ramfs_fs_type); -} - module_init(init_ramfs_fs) -module_exit(exit_ramfs_fs) int __init init_rootfs(void) { @@ -311,5 +303,3 @@ int __init init_rootfs(void) return err; } - -MODULE_LICENSE("GPL"); diff --git a/fs/read_write.c b/fs/read_write.c index 5907b49e4d7e..5ad4248b0cd8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -35,23 +35,45 @@ static inline int unsigned_offsets(struct file *file) return file->f_mode & FMODE_UNSIGNED_OFFSET; } +static loff_t lseek_execute(struct file *file, struct inode *inode, + loff_t offset, loff_t maxsize) +{ + if (offset < 0 && !unsigned_offsets(file)) + return -EINVAL; + if (offset > maxsize) + return -EINVAL; + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + return offset; +} + /** - * generic_file_llseek_unlocked - lockless generic llseek implementation + * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @origin: type of seek + * @size: max size of file system * - * Updates the file offset to the value specified by @offset and @origin. - * Locking must be provided by the caller. + * This is a variant of generic_file_llseek that allows passing in a custom + * file size. + * + * Synchronization: + * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) + * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. + * read/writes behave like SEEK_SET against seeks. */ loff_t -generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) +generic_file_llseek_size(struct file *file, loff_t offset, int origin, + loff_t maxsize) { struct inode *inode = file->f_mapping->host; switch (origin) { case SEEK_END: - offset += inode->i_size; + offset += i_size_read(inode); break; case SEEK_CUR: /* @@ -62,14 +84,22 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) */ if (offset == 0) return file->f_pos; - offset += file->f_pos; - break; + /* + * f_lock protects against read/modify/write race with other + * SEEK_CURs. Note that parallel writes and reads behave + * like SEEK_SET. + */ + spin_lock(&file->f_lock); + offset = lseek_execute(file, inode, file->f_pos + offset, + maxsize); + spin_unlock(&file->f_lock); + return offset; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ - if (offset >= inode->i_size) + if (offset >= i_size_read(inode)) return -ENXIO; break; case SEEK_HOLE: @@ -77,26 +107,15 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ - if (offset >= inode->i_size) + if (offset >= i_size_read(inode)) return -ENXIO; - offset = inode->i_size; + offset = i_size_read(inode); break; } - if (offset < 0 && !unsigned_offsets(file)) - return -EINVAL; - if (offset > inode->i_sb->s_maxbytes) - return -EINVAL; - - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - - return offset; + return lseek_execute(file, inode, offset, maxsize); } -EXPORT_SYMBOL(generic_file_llseek_unlocked); +EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_file_llseek - generic llseek implementation for regular files @@ -110,13 +129,10 @@ EXPORT_SYMBOL(generic_file_llseek_unlocked); */ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) { - loff_t rval; - - mutex_lock(&file->f_dentry->d_inode->i_mutex); - rval = generic_file_llseek_unlocked(file, offset, origin); - mutex_unlock(&file->f_dentry->d_inode->i_mutex); + struct inode *inode = file->f_mapping->host; - return rval; + return generic_file_llseek_size(file, offset, origin, + inode->i_sb->s_maxbytes); } EXPORT_SYMBOL(generic_file_llseek); @@ -166,8 +182,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) * long as offset isn't at the end of the file then the * offset is data. */ - if (offset >= inode->i_size) - return -ENXIO; + if (offset >= inode->i_size) { + retval = -ENXIO; + goto out; + } break; case SEEK_HOLE: /* @@ -175,8 +193,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) * as long as offset isn't i_size or larger, return * i_size. */ - if (offset >= inode->i_size) - return -ENXIO; + if (offset >= inode->i_size) { + retval = -ENXIO; + goto out; + } offset = inode->i_size; break; } @@ -613,7 +633,8 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) + struct iovec **ret_pointer, + int check_access) { unsigned long seg; ssize_t ret; @@ -669,7 +690,8 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, ret = -EINVAL; goto out; } - if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { + if (check_access + && unlikely(!access_ok(vrfy_dir(type), buf, len))) { ret = -EFAULT; goto out; } @@ -701,7 +723,7 @@ static ssize_t do_readv_writev(int type, struct file *file, } ret = rw_copy_check_uvector(type, uvector, nr_segs, - ARRAY_SIZE(iovstack), iovstack, &iov); + ARRAY_SIZE(iovstack), iovstack, &iov, 1); if (ret <= 0) goto out; diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 483442e66ed6..d1aca1df4f92 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -214,7 +214,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, } /* otherwise we clear all bit were set ... */ while (--i >= *beg) - reiserfs_test_and_clear_le_bit + reiserfs_clear_le_bit (i, bh->b_data); reiserfs_restore_prepared_buffer(s, bh); *beg = org; @@ -1222,15 +1222,11 @@ void reiserfs_cache_bitmap_metadata(struct super_block *sb, info->free_count = 0; while (--cur >= (unsigned long *)bh->b_data) { - int i; - /* 0 and ~0 are special, we can optimize for them */ if (*cur == 0) info->free_count += BITS_PER_LONG; else if (*cur != ~0L) /* A mix, investigate */ - for (i = BITS_PER_LONG - 1; i >= 0; i--) - if (!reiserfs_test_le_bit(i, cur)) - info->free_count++; + info->free_count += BITS_PER_LONG - hweight_long(*cur); } } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index c7156dc39ce7..ace635053a36 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -319,5 +319,5 @@ const struct inode_operations reiserfs_file_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .check_acl = reiserfs_check_acl, + .get_acl = reiserfs_get_acl, }; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 2922b90ceac1..950f13af0951 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1154,7 +1154,7 @@ static void init_inode(struct inode *inode, struct treepath *path) set_inode_item_key_version(inode, KEY_FORMAT_3_5); set_inode_sd_version(inode, STAT_DATA_V1); inode->i_mode = sd_v1_mode(sd); - inode->i_nlink = sd_v1_nlink(sd); + set_nlink(inode, sd_v1_nlink(sd)); inode->i_uid = sd_v1_uid(sd); inode->i_gid = sd_v1_gid(sd); inode->i_size = sd_v1_size(sd); @@ -1199,7 +1199,7 @@ static void init_inode(struct inode *inode, struct treepath *path) struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); inode->i_mode = sd_v2_mode(sd); - inode->i_nlink = sd_v2_nlink(sd); + set_nlink(inode, sd_v2_nlink(sd)); inode->i_uid = sd_v2_uid(sd); inode->i_size = sd_v2_size(sd); inode->i_gid = sd_v2_gid(sd); @@ -1444,7 +1444,7 @@ void reiserfs_read_locked_inode(struct inode *inode, /* a stale NFS handle can trigger this without it being an error */ pathrelse(&path_to_sd); reiserfs_make_bad_inode(inode); - inode->i_nlink = 0; + clear_nlink(inode); return; } @@ -1475,6 +1475,11 @@ void reiserfs_read_locked_inode(struct inode *inode, reiserfs_check_path(&path_to_sd); /* init inode should be relsing */ + /* + * Stat data v1 doesn't support ACLs. + */ + if (get_inode_sd_version(inode) == STAT_DATA_V1) + cache_no_acl(inode); } /** @@ -1827,7 +1832,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, #endif /* fill stat data */ - inode->i_nlink = (S_ISDIR(mode) ? 2 : 1); + set_nlink(inode, (S_ISDIR(mode) ? 2 : 1)); /* uid and gid must already be set by the caller for quota init */ @@ -1982,7 +1987,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, make_bad_inode(inode); out_inserted_sd: - inode->i_nlink = 0; + clear_nlink(inode); th->t_trans_id = 0; /* so the caller can't use this handle later */ unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ iput(inode); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index c5e82ece7c6c..eb711060a6f2 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -291,14 +291,13 @@ int reiserfs_allocate_list_bitmaps(struct super_block *sb, for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { jb = jb_array + i; jb->journal_list = NULL; - jb->bitmaps = vmalloc(mem); + jb->bitmaps = vzalloc(mem); if (!jb->bitmaps) { reiserfs_warning(sb, "clm-2000", "unable to " "allocate bitmaps for journal lists"); failed = 1; break; } - memset(jb->bitmaps, 0, mem); } if (failed) { free_list_bitmaps(sb, jb_array); @@ -353,11 +352,10 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) if (num_cnodes <= 0) { return NULL; } - head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); + head = vzalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); if (!head) { return NULL; } - memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)); head[0].prev = NULL; head[0].next = head + 1; for (i = 1; i < num_cnodes; i++) { @@ -678,23 +676,19 @@ struct buffer_chunk { static void write_chunk(struct buffer_chunk *chunk) { int i; - get_fs_excl(); for (i = 0; i < chunk->nr; i++) { submit_logged_buffer(chunk->bh[i]); } chunk->nr = 0; - put_fs_excl(); } static void write_ordered_chunk(struct buffer_chunk *chunk) { int i; - get_fs_excl(); for (i = 0; i < chunk->nr; i++) { submit_ordered_buffer(chunk->bh[i]); } chunk->nr = 0; - put_fs_excl(); } static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, @@ -986,8 +980,6 @@ static int flush_commit_list(struct super_block *s, return 0; } - get_fs_excl(); - /* before we can put our commit blocks on disk, we have to make sure everyone older than ** us is on disk too */ @@ -1145,7 +1137,6 @@ static int flush_commit_list(struct super_block *s, if (retval) reiserfs_abort(s, retval, "Journal write error in %s", __func__); - put_fs_excl(); return retval; } @@ -1374,8 +1365,6 @@ static int flush_journal_list(struct super_block *s, return 0; } - get_fs_excl(); - /* if all the work is already done, get out of here */ if (atomic_read(&(jl->j_nonzerolen)) <= 0 && atomic_read(&(jl->j_commit_left)) <= 0) { @@ -1597,7 +1586,6 @@ static int flush_journal_list(struct super_block *s, put_journal_list(s, jl); if (flushall) mutex_unlock(&journal->j_flush_mutex); - put_fs_excl(); return err; } @@ -2695,14 +2683,13 @@ int journal_init(struct super_block *sb, const char *j_dev_name, * dependency inversion warnings. */ reiserfs_write_unlock(sb); - journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); + journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); if (!journal) { reiserfs_warning(sb, "journal-1256", "unable to get memory for journal structure"); reiserfs_write_lock(sb); return 1; } - memset(journal, 0, sizeof(struct reiserfs_journal)); INIT_LIST_HEAD(&journal->j_bitmap_nodes); INIT_LIST_HEAD(&journal->j_prealloc_list); INIT_LIST_HEAD(&journal->j_working_list); @@ -3108,7 +3095,6 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, th->t_trans_id = journal->j_trans_id; unlock_journal(sb); INIT_LIST_HEAD(&th->t_list); - get_fs_excl(); return 0; out_fail: @@ -3964,7 +3950,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, flush = flags & FLUSH_ALL; wait_on_commit = flags & WAIT; - put_fs_excl(); current->journal_info = th->t_handle_save; reiserfs_check_lock_depth(sb, "journal end"); if (journal->j_len == 0) { @@ -4316,4 +4301,3 @@ void reiserfs_abort_journal(struct super_block *sb, int errno) dump_stack(); #endif } - diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 551f1b79dbc4..80058e8ce361 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -19,7 +19,7 @@ #include <linux/reiserfs_xattr.h> #include <linux/quotaops.h> -#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; } +#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); } #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); // directory item contains array of entry headers. This performs @@ -622,7 +622,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); reiserfs_update_sd(&th, inode); err = journal_end(&th, dir->i_sb, jbegin_count); if (err) @@ -702,7 +702,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); reiserfs_update_sd(&th, inode); err = journal_end(&th, dir->i_sb, jbegin_count); if (err) @@ -787,7 +787,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink = 0; + clear_nlink(inode); DEC_DIR_INODE_NLINK(dir); reiserfs_update_sd(&th, inode); err = journal_end(&th, dir->i_sb, jbegin_count); @@ -964,7 +964,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) reiserfs_warning(inode->i_sb, "reiserfs-7042", "deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } drop_nlink(inode); @@ -1086,7 +1086,7 @@ static int reiserfs_symlink(struct inode *parent_dir, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); reiserfs_update_sd(&th, inode); err = journal_end(&th, parent_dir->i_sb, jbegin_count); if (err) @@ -1129,7 +1129,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) { - inode->i_nlink--; + drop_nlink(inode); reiserfs_write_unlock(dir->i_sb); return retval; } @@ -1144,7 +1144,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); err = journal_end(&th, dir->i_sb, jbegin_count); reiserfs_write_unlock(dir->i_sb); return err ? err : retval; @@ -1529,7 +1529,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .check_acl = reiserfs_check_acl, + .get_acl = reiserfs_get_acl, }; /* @@ -1546,7 +1546,7 @@ const struct inode_operations reiserfs_symlink_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .check_acl = reiserfs_check_acl, + .get_acl = reiserfs_get_acl, }; @@ -1560,5 +1560,5 @@ const struct inode_operations reiserfs_special_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .check_acl = reiserfs_check_acl, + .get_acl = reiserfs_get_acl, }; diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index b3a94d20f0fc..7483279b482d 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -111,15 +111,13 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) /* allocate additional bitmap blocks, reallocate array of bitmap * block pointers */ bitmap = - vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); + vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); if (!bitmap) { /* Journal bitmaps are still supersized, but the memory isn't * leaked, so I guess it's ok */ printk("reiserfs_resize: unable to allocate memory.\n"); return -ENOMEM; } - memset(bitmap, 0, - sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); for (i = 0; i < bmap_nr; i++) bitmap[i] = old_bitmap[i]; @@ -136,7 +134,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) return -EIO; } memset(bh->b_data, 0, sb_blocksize(sb)); - reiserfs_test_and_set_le_bit(0, bh->b_data); + reiserfs_set_le_bit(0, bh->b_data); reiserfs_cache_bitmap_metadata(s, bh, bitmap + i); set_buffer_uptodate(bh); @@ -172,7 +170,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) reiserfs_prepare_for_journal(s, bh, 1); for (i = block_r; i < s->s_blocksize * 8; i++) - reiserfs_test_and_clear_le_bit(i, bh->b_data); + reiserfs_clear_le_bit(i, bh->b_data); info->free_count += s->s_blocksize * 8 - block_r; journal_mark_dirty(&th, s, bh); @@ -190,7 +188,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) reiserfs_prepare_for_journal(s, bh, 1); for (i = block_r_new; i < s->s_blocksize * 8; i++) - reiserfs_test_and_set_le_bit(i, bh->b_data); + reiserfs_set_le_bit(i, bh->b_data); journal_mark_dirty(&th, s, bh); brelse(bh); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 6938d8c68d6e..6bc346c160e7 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -867,33 +867,6 @@ out: return err; } -int reiserfs_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - int error = -EAGAIN; /* do regular unix permission checks by default */ - - /* - * Stat data v1 doesn't support ACLs. - */ - if (get_inode_sd_version(inode) == STAT_DATA_V1) - return -EAGAIN; - - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - - if (acl) { - if (!IS_ERR(acl)) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - } else if (PTR_ERR(acl) != -ENODATA) - error = PTR_ERR(acl); - } - - return error; -} - static int create_privroot(struct dentry *dentry) { int err; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 3dc38f1206fc..6da0396e5052 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -272,12 +272,10 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; if (error == 0) acl = NULL; } @@ -354,10 +352,6 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, return PTR_ERR(acl); if (acl) { - struct posix_acl *acl_copy; - mode_t mode = inode->i_mode; - int need_acl; - /* Copy the default ACL to the default ACL of a new directory */ if (S_ISDIR(inode->i_mode)) { err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, @@ -368,29 +362,13 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, /* Now we reconcile the new ACL and the mode, potentially modifying both */ - acl_copy = posix_acl_clone(acl, GFP_NOFS); - if (!acl_copy) { - err = -ENOMEM; - goto cleanup; - } - - need_acl = posix_acl_create_masq(acl_copy, &mode); - if (need_acl >= 0) { - if (mode != inode->i_mode) { - inode->i_mode = mode; - } + err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + if (err < 0) + return err; - /* If we need an ACL.. */ - if (need_acl > 0) { - err = reiserfs_set_acl(th, inode, - ACL_TYPE_ACCESS, - acl_copy); - if (err) - goto cleanup_copy; - } - } - cleanup_copy: - posix_acl_release(acl_copy); + /* If we need an ACL.. */ + if (err > 0) + err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl); cleanup: posix_acl_release(acl); } else { @@ -445,7 +423,10 @@ int reiserfs_cache_default_acl(struct inode *inode) int reiserfs_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct reiserfs_transaction_handle th; + struct posix_acl *acl; + size_t size; + int depth; int error; if (S_ISLNK(inode->i_mode)) @@ -463,30 +444,22 @@ int reiserfs_acl_chmod(struct inode *inode) return 0; if (IS_ERR(acl)) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_NOFS); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); + error = posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode); + if (error) + return error; + + size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count)); + depth = reiserfs_write_lock_once(inode->i_sb); + error = journal_begin(&th, inode->i_sb, size * 2); if (!error) { - struct reiserfs_transaction_handle th; - size_t size = reiserfs_xattr_nblocks(inode, - reiserfs_acl_size(clone->a_count)); - int depth; - - depth = reiserfs_write_lock_once(inode->i_sb); - error = journal_begin(&th, inode->i_sb, size * 2); - if (!error) { - int error2; - error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, - clone); - error2 = journal_end(&th, inode->i_sb, size * 2); - if (error2) - error = error2; - } - reiserfs_write_unlock_once(inode->i_sb, depth); + int error2; + error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl); + error2 = journal_end(&th, inode->i_sb, size * 2); + if (error2) + error = error2; } - posix_acl_release(clone); + reiserfs_write_unlock_once(inode->i_sb, depth); + posix_acl_release(acl); return error; } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index ef66c18a9332..534668fa41be 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -66,8 +66,8 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, if (IS_PRIVATE(dir)) return 0; - error = security_inode_init_security(inode, dir, qstr, &sec->name, - &sec->value, &sec->length); + error = security_old_inode_init_security(inode, dir, qstr, &sec->name, + &sec->value, &sec->length); if (error) { if (error == -EOPNOTSUPP) error = 0; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 2305e3121cb1..8b4089f30408 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -337,7 +337,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; inode->i_dataoffset = pos + inode->i_metasize; - i->i_nlink = 1; /* Hard to decide.. */ + set_nlink(i, 1); /* Hard to decide.. */ i->i_size = be32_to_cpu(ri.size); i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; diff --git a/fs/splice.c b/fs/splice.c index aa866d309695..fa2defa8afcf 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -132,7 +132,7 @@ error: return err; } -static const struct pipe_buf_operations page_cache_pipe_buf_ops = { +const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -264,7 +264,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, return ret; } -static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) +void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { page_cache_release(spd->pages[i]); } diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 7797218d0b30..c70111ebefd4 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -1,7 +1,6 @@ config SQUASHFS tristate "SquashFS 4.0 - Squashed file system support" depends on BLOCK - select ZLIB_INFLATE help Saying Y here includes support for SquashFS 4.0 (a Compressed Read-Only File System). Squashfs is a highly compressed read-only @@ -20,9 +19,9 @@ config SQUASHFS If you want to compile this as a module ( = code which can be inserted in and removed from the running kernel whenever you want), - say M here and read <file:Documentation/modules.txt>. The module - will be called squashfs. Note that the root file system (the one - containing the directory /) cannot be compiled as a module. + say M here. The module will be called squashfs. Note that the root + file system (the one containing the directory /) cannot be compiled + as a module. If unsure, say N. @@ -36,6 +35,19 @@ config SQUASHFS_XATTR If unsure, say N. +config SQUASHFS_ZLIB + bool "Include support for ZLIB compressed file systems" + depends on SQUASHFS + select ZLIB_INFLATE + default y + help + ZLIB compression is the standard compression used by Squashfs + file systems. It offers a good trade-off between compression + achieved and the amount of CPU time and memory necessary to + compress and decompress. + + If unsure, say Y. + config SQUASHFS_LZO bool "Include support for LZO compressed file systems" depends on SQUASHFS @@ -66,6 +78,28 @@ config SQUASHFS_XZ If unsure, say N. +config SQUASHFS_4K_DEVBLK_SIZE + bool "Use 4K device block size?" + depends on SQUASHFS + help + By default Squashfs sets the dev block size (sb_min_blocksize) + to 1K or the smallest block size supported by the block device + (if larger). This, because blocks are packed together and + unaligned in Squashfs, should reduce latency. + + This, however, gives poor performance on MTD NAND devices where + the optimal I/O size is 4K (even though the devices can support + smaller block sizes). + + Using a 4K device block size may also improve overall I/O + performance for some file access patterns (e.g. sequential + accesses of files in filesystem order) on all media. + + Setting this option will force Squashfs to use a 4K device block + size by default. + + If unsure, say N. + config SQUASHFS_EMBEDDED bool "Additional option for memory-constrained systems" depends on SQUASHFS diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index cecf2bea07af..110b0476f3b4 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -4,7 +4,8 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o +squashfs-y += namei.o super.o symlink.o decompressor.o squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o +squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 9f1b0bb96f13..3f6271d86abc 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -52,6 +52,12 @@ static const struct squashfs_decompressor squashfs_xz_comp_ops = { }; #endif +#ifndef CONFIG_SQUASHFS_ZLIB +static const struct squashfs_decompressor squashfs_zlib_comp_ops = { + NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0 +}; +#endif + static const struct squashfs_decompressor squashfs_unknown_comp_ops = { NULL, NULL, NULL, 0, "unknown", 0 }; diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 8ba70cff09a6..330073e29029 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -56,4 +56,8 @@ extern const struct squashfs_decompressor squashfs_xz_comp_ops; extern const struct squashfs_decompressor squashfs_lzo_comp_ops; #endif +#ifdef CONFIG_SQUASHFS_ZLIB +extern const struct squashfs_decompressor squashfs_zlib_comp_ops; +#endif + #endif diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 04bebcaa2373..fd7b3b3bda13 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -159,7 +159,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) frag_offset = 0; } - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; @@ -203,7 +203,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) } xattr_id = le32_to_cpu(sqsh_ino->xattr); - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le64_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_inode_ops; inode->i_fop = &generic_ro_fops; @@ -232,7 +232,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le16_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; @@ -257,7 +257,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) goto failed_read; xattr_id = le32_to_cpu(sqsh_ino->xattr); - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; @@ -284,7 +284,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); inode->i_op = &squashfs_symlink_inode_ops; inode->i_data.a_ops = &squashfs_symlink_aops; @@ -325,7 +325,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFCHR; else inode->i_mode |= S_IFBLK; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); @@ -349,7 +349,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFBLK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); @@ -370,7 +370,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFIFO; else inode->i_mode |= S_IFSOCK; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } @@ -389,7 +389,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFSOCK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index e3be6a71cfa7..d1266516ed08 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -97,6 +97,3 @@ extern const struct inode_operations squashfs_symlink_inode_ops; /* xattr.c */ extern const struct xattr_handler *squashfs_xattr_handlers[]; - -/* zlib_wrapper.c */ -extern const struct squashfs_decompressor squashfs_zlib_comp_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index b4a4e539a08c..e8e14645de9a 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -36,6 +36,13 @@ #define SQUASHFS_FILE_SIZE 131072 #define SQUASHFS_FILE_LOG 17 +/* default size of block device I/O */ +#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE +#define SQUASHFS_DEVBLK_SIZE 4096 +#else +#define SQUASHFS_DEVBLK_SIZE 1024 +#endif + #define SQUASHFS_FILE_MAX_SIZE 1048576 #define SQUASHFS_FILE_MAX_LOG 20 diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 7438850c62d0..2da1715452ac 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -95,7 +95,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) } msblk = sb->s_fs_info; - msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); + msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->read_data_mutex); diff --git a/fs/stack.c b/fs/stack.c index 4a6f7f440658..9c11519245a6 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -29,10 +29,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) * * We don't actually know what locking is used at the lower level; * but if it's a filesystem that supports quotas, it will be using - * i_lock as in inode_add_bytes(). tmpfs uses other locking, and - * its 32-bit is (just) able to exceed 2TB i_size with the aid of - * holes; but its i_blocks cannot carry into the upper long without - * almost 2TB swap - let's ignore that case. + * i_lock as in inode_add_bytes(). */ if (sizeof(i_blocks) > sizeof(long)) spin_lock(&src->i_lock); @@ -74,6 +71,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) dest->i_ctime = src->i_ctime; dest->i_blkbits = src->i_blkbits; dest->i_flags = src->i_flags; - dest->i_nlink = src->i_nlink; + set_nlink(dest, src->i_nlink); } EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); diff --git a/fs/stat.c b/fs/stat.c index 961039121cb8..8806b8997d2e 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -27,12 +27,12 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; stat->ctime = inode->i_ctime; - stat->size = i_size_read(inode); - stat->blocks = inode->i_blocks; stat->blksize = (1 << inode->i_blkbits); + stat->blocks = inode->i_blocks; } EXPORT_SYMBOL(generic_fillattr); @@ -81,8 +81,6 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, if (!(flag & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - if (flag & AT_NO_AUTOMOUNT) - lookup_flags |= LOOKUP_NO_AUTOMOUNT; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; @@ -296,15 +294,16 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, { struct path path; int error; + int empty = 0; if (bufsiz <= 0) return -EINVAL; - error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path); + error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty); if (!error) { struct inode *inode = path.dentry->d_inode; - error = -EINVAL; + error = empty ? -ENOENT : -EINVAL; if (inode->i_op->readlink) { error = security_inode_readlink(path.dentry); if (!error) { diff --git a/fs/statfs.c b/fs/statfs.c index 8244924dec55..9cf04a118965 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -76,7 +76,7 @@ EXPORT_SYMBOL(vfs_statfs); int user_statfs(const char __user *pathname, struct kstatfs *st) { struct path path; - int error = user_path(pathname, &path); + int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); if (!error) { error = vfs_statfs(&path, st); path_put(&path); diff --git a/fs/super.c b/fs/super.c index 7943f04cb3a9..afd0f1ad45e0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -61,7 +61,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) return -1; if (!grab_super_passive(sb)) - return -1; + return !sc->nr_to_scan ? 0 : -1; if (sb->s_op && sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb); @@ -351,13 +351,11 @@ bool grab_super_passive(struct super_block *sb) */ void lock_super(struct super_block * sb) { - get_fs_excl(); mutex_lock(&sb->s_lock); } void unlock_super(struct super_block * sb) { - put_fs_excl(); mutex_unlock(&sb->s_lock); } @@ -385,7 +383,6 @@ void generic_shutdown_super(struct super_block *sb) if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); - get_fs_excl(); sb->s_flags &= ~MS_ACTIVE; fsnotify_unmount_inodes(&sb->s_inodes); @@ -400,7 +397,6 @@ void generic_shutdown_super(struct super_block *sb) "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); } - put_fs_excl(); } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ @@ -731,8 +727,13 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); - if (retval) - return retval; + if (retval) { + if (!force) + return retval; + /* If forced remount, go ahead despite any errors */ + WARN(1, "forced remount of a %s fs returned %i\n", + sb->s_type->name, retval); + } } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); diff --git a/fs/sync.c b/fs/sync.c index c98a7477edfd..101b8ef901d7 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (wait) sync_inodes_sb(sb); else - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, WB_REASON_SYNC); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); @@ -98,7 +98,7 @@ static void sync_filesystems(int wait) */ SYSCALL_DEFINE0(sync) { - wakeup_flusher_threads(0); + wakeup_flusher_threads(0, WB_REASON_SYNC); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index ea9120a830d8..7fdf6a7b7436 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,20 +43,48 @@ static DEFINE_IDA(sysfs_ino_ida); static void sysfs_link_sibling(struct sysfs_dirent *sd) { struct sysfs_dirent *parent_sd = sd->s_parent; - struct sysfs_dirent **pos; - BUG_ON(sd->s_sibling); - - /* Store directory entries in order by ino. This allows - * readdir to properly restart without having to add a - * cursor into the s_dir.children list. - */ - for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) { - if (sd->s_ino < (*pos)->s_ino) - break; + struct rb_node **p; + struct rb_node *parent; + + if (sysfs_type(sd) == SYSFS_DIR) + parent_sd->s_dir.subdirs++; + + p = &parent_sd->s_dir.inode_tree.rb_node; + parent = NULL; + while (*p) { + parent = *p; +#define node rb_entry(parent, struct sysfs_dirent, inode_node) + if (sd->s_ino < node->s_ino) { + p = &node->inode_node.rb_left; + } else if (sd->s_ino > node->s_ino) { + p = &node->inode_node.rb_right; + } else { + printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n", + (unsigned long) sd->s_ino); + BUG(); + } +#undef node } - sd->s_sibling = *pos; - *pos = sd; + rb_link_node(&sd->inode_node, parent, p); + rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree); + + p = &parent_sd->s_dir.name_tree.rb_node; + parent = NULL; + while (*p) { + int c; + parent = *p; +#define node rb_entry(parent, struct sysfs_dirent, name_node) + c = strcmp(sd->s_name, node->s_name); + if (c < 0) { + p = &node->name_node.rb_left; + } else { + p = &node->name_node.rb_right; + } +#undef node + } + rb_link_node(&sd->name_node, parent, p); + rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree); } /** @@ -71,16 +99,11 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) */ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) { - struct sysfs_dirent **pos; + if (sysfs_type(sd) == SYSFS_DIR) + sd->s_parent->s_dir.subdirs--; - for (pos = &sd->s_parent->s_dir.children; *pos; - pos = &(*pos)->s_sibling) { - if (*pos == sd) { - *pos = sd->s_sibling; - sd->s_sibling = NULL; - break; - } - } + rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree); + rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree); } /** @@ -126,7 +149,6 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) */ void sysfs_put_active(struct sysfs_dirent *sd) { - struct completion *cmpl; int v; if (unlikely(!sd)) @@ -138,10 +160,9 @@ void sysfs_put_active(struct sysfs_dirent *sd) return; /* atomic_dec_return() is a mb(), we'll always see the updated - * sd->s_sibling. + * sd->u.completion. */ - cmpl = (void *)sd->s_sibling; - complete(cmpl); + complete(sd->u.completion); } /** @@ -155,16 +176,16 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) DECLARE_COMPLETION_ONSTACK(wait); int v; - BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); + BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED)); if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) return; - sd->s_sibling = (void *)&wait; + sd->u.completion = (void *)&wait; rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); /* atomic_add_return() is a mb(), put_active() will always see - * the updated sd->s_sibling. + * the updated sd->u.completion. */ v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); @@ -173,8 +194,6 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) wait_for_completion(&wait); } - sd->s_sibling = NULL; - lock_acquired(&sd->dep_map, _RET_IP_); rwsem_release(&sd->dep_map, 1, _RET_IP_); } @@ -384,6 +403,13 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { struct sysfs_inode_attrs *ps_iattr; + if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + sysfs_ns_type(acxt->parent_sd)? "required": "invalid", + acxt->parent_sd->s_name, sd->s_name); + return -EINVAL; + } + if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) return -EEXIST; @@ -490,7 +516,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) } sd->s_flags |= SYSFS_FLAG_REMOVED; - sd->s_sibling = acxt->removed; + sd->u.removed_list = acxt->removed; acxt->removed = sd; } @@ -514,8 +540,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) while (acxt->removed) { struct sysfs_dirent *sd = acxt->removed; - acxt->removed = sd->s_sibling; - sd->s_sibling = NULL; + acxt->removed = sd->u.removed_list; sysfs_deactivate(sd); unmap_bin_file(sd); @@ -540,15 +565,43 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, const void *ns, const unsigned char *name) { - struct sysfs_dirent *sd; + struct rb_node *p = parent_sd->s_dir.name_tree.rb_node; + struct sysfs_dirent *found = NULL; - for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { - if (ns && sd->s_ns && (sd->s_ns != ns)) - continue; - if (!strcmp(sd->s_name, name)) - return sd; + if (!!sysfs_ns_type(parent_sd) != !!ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + sysfs_ns_type(parent_sd)? "required": "invalid", + parent_sd->s_name, name); + return NULL; } - return NULL; + + while (p) { + int c; +#define node rb_entry(p, struct sysfs_dirent, name_node) + c = strcmp(name, node->s_name); + if (c < 0) { + p = node->name_node.rb_left; + } else if (c > 0) { + p = node->name_node.rb_right; + } else { + found = node; + p = node->name_node.rb_left; + } +#undef node + } + + if (found) { + while (found->s_ns != ns) { + p = rb_next(&found->name_node); + if (!p) + return NULL; + found = rb_entry(p, struct sysfs_dirent, name_node); + if (strcmp(name, found->s_name)) + return NULL; + } + } + + return found; } /** @@ -744,21 +797,19 @@ void sysfs_remove_subdir(struct sysfs_dirent *sd) static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) { struct sysfs_addrm_cxt acxt; - struct sysfs_dirent **pos; + struct rb_node *pos; if (!dir_sd) return; pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); sysfs_addrm_start(&acxt, dir_sd); - pos = &dir_sd->s_dir.children; - while (*pos) { - struct sysfs_dirent *sd = *pos; - + pos = rb_first(&dir_sd->s_dir.inode_tree); + while (pos) { + struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node); + pos = rb_next(pos); if (sysfs_type(sd) != SYSFS_DIR) sysfs_remove_one(&acxt, sd); - else - pos = &(*pos)->s_sibling; } sysfs_addrm_finish(&acxt); @@ -814,15 +865,13 @@ int sysfs_rename(struct sysfs_dirent *sd, sd->s_name = new_name; } - /* Remove from old parent's list and insert into new parent's list. */ - if (sd->s_parent != new_parent_sd) { - sysfs_unlink_sibling(sd); - sysfs_get(new_parent_sd); - sysfs_put(sd->s_parent); - sd->s_parent = new_parent_sd; - sysfs_link_sibling(sd); - } + /* Move to the appropriate place in the appropriate directories rbtree. */ + sysfs_unlink_sibling(sd); + sysfs_get(new_parent_sd); + sysfs_put(sd->s_parent); sd->s_ns = new_ns; + sd->s_parent = new_parent_sd; + sysfs_link_sibling(sd); error = 0; out: @@ -881,12 +930,28 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns, pos = NULL; } if (!pos && (ino > 1) && (ino < INT_MAX)) { - pos = parent_sd->s_dir.children; - while (pos && (ino > pos->s_ino)) - pos = pos->s_sibling; + struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node; + while (p) { +#define node rb_entry(p, struct sysfs_dirent, inode_node) + if (ino < node->s_ino) { + pos = node; + p = node->inode_node.rb_left; + } else if (ino > node->s_ino) { + p = node->inode_node.rb_right; + } else { + pos = node; + break; + } +#undef node + } + } + while (pos && pos->s_ns != ns) { + struct rb_node *p = rb_next(&pos->inode_node); + if (!p) + pos = NULL; + else + pos = rb_entry(p, struct sysfs_dirent, inode_node); } - while (pos && pos->s_ns && pos->s_ns != ns) - pos = pos->s_sibling; return pos; } @@ -894,10 +959,13 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) { pos = sysfs_dir_pos(ns, parent_sd, ino, pos); - if (pos) - pos = pos->s_sibling; - while (pos && pos->s_ns && pos->s_ns != ns) - pos = pos->s_sibling; + if (pos) do { + struct rb_node *p = rb_next(&pos->inode_node); + if (!p) + pos = NULL; + else + pos = rb_entry(p, struct sysfs_dirent, inode_node); + } while (pos && pos->s_ns != ns); return pos; } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1ad8c93c1b85..d4e6080b4b20 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -466,9 +466,6 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) mutex_lock(&sysfs_mutex); if (sd && dir) - /* Only directories are tagged, so no need to pass - * a tag explicitly. - */ sd = sysfs_find_dirent(sd, NULL, dir); if (sd && attr) sd = sysfs_find_dirent(sd, NULL, attr); @@ -488,17 +485,56 @@ const struct file_operations sysfs_file_operations = { .poll = sysfs_poll, }; +int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, + const void **pns) +{ + struct sysfs_dirent *dir_sd = kobj->sd; + const struct sysfs_ops *ops; + const void *ns = NULL; + int err; + + err = 0; + if (!sysfs_ns_type(dir_sd)) + goto out; + + err = -EINVAL; + if (!kobj->ktype) + goto out; + ops = kobj->ktype->sysfs_ops; + if (!ops) + goto out; + if (!ops->namespace) + goto out; + + err = 0; + ns = ops->namespace(kobj, attr); +out: + if (err) { + WARN(1, KERN_ERR "missing sysfs namespace attribute operation for " + "kobject: %s\n", kobject_name(kobj)); + } + *pns = ns; + return err; +} + int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type, mode_t amode) { umode_t mode = (amode & S_IALLUGO) | S_IFREG; struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; + const void *ns; int rc; + rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns); + if (rc) + return rc; + sd = sysfs_new_dirent(attr->name, mode, type); if (!sd) return -ENOMEM; + + sd->s_ns = ns; sd->s_attr.attr = (void *)attr; sysfs_dirent_init_lockdep(sd); @@ -586,12 +622,17 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, { struct sysfs_dirent *sd; struct iattr newattrs; + const void *ns; int rc; + rc = sysfs_attr_ns(kobj, attr, &ns); + if (rc) + return rc; + mutex_lock(&sysfs_mutex); rc = -ENOENT; - sd = sysfs_find_dirent(kobj->sd, NULL, attr->name); + sd = sysfs_find_dirent(kobj->sd, ns, attr->name); if (!sd) goto out; @@ -616,7 +657,12 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { - sysfs_hash_and_remove(kobj->sd, NULL, attr->name); + const void *ns; + + if (sysfs_attr_ns(kobj, attr, &ns)) + return; + + sysfs_hash_and_remove(kobj->sd, ns, attr->name); } void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e3f091a81c72..c81b22f3ace1 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -202,18 +202,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_ctime = iattr->ia_ctime; } -static int sysfs_count_nlink(struct sysfs_dirent *sd) -{ - struct sysfs_dirent *child; - int nr = 0; - - for (child = sd->s_dir.children; child; child = child->s_sibling) - if (sysfs_type(child) == SYSFS_DIR) - nr++; - - return nr + 2; -} - static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) { struct sysfs_inode_attrs *iattrs = sd->s_iattr; @@ -230,7 +218,7 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) } if (sysfs_type(sd) == SYSFS_DIR) - inode->i_nlink = sysfs_count_nlink(sd); + set_nlink(inode, sd->s_dir.subdirs + 2); } int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) @@ -336,8 +324,6 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha sysfs_addrm_start(&acxt, dir_sd); sd = sysfs_find_dirent(dir_sd, ns, name); - if (sd && (sd->s_ns != ns)) - sd = NULL; if (sd) sysfs_remove_one(&acxt, sd); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 845ab3ad229d..ce29e28b766d 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -11,14 +11,18 @@ #include <linux/lockdep.h> #include <linux/kobject_ns.h> #include <linux/fs.h> +#include <linux/rbtree.h> struct sysfs_open_dirent; /* type-specific structures for sysfs_dirent->s_* union members */ struct sysfs_elem_dir { struct kobject *kobj; - /* children list starts here and goes through sd->s_sibling */ - struct sysfs_dirent *children; + + unsigned long subdirs; + + struct rb_root inode_tree; + struct rb_root name_tree; }; struct sysfs_elem_symlink { @@ -56,9 +60,16 @@ struct sysfs_dirent { struct lockdep_map dep_map; #endif struct sysfs_dirent *s_parent; - struct sysfs_dirent *s_sibling; const char *s_name; + struct rb_node inode_node; + struct rb_node name_node; + + union { + struct completion *completion; + struct sysfs_dirent *removed_list; + } u; + const void *s_ns; /* namespace tag */ union { struct sysfs_elem_dir s_dir; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 0630eb969a28..25ffb3e9a3f8 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -219,7 +219,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino) inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); inode->i_uid = (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid); inode->i_gid = (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid); - inode->i_nlink = fs16_to_cpu(sbi, raw_inode->i_nlink); + set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink)); inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime); diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 315de66e52b2..bc4f94b28706 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -63,7 +63,7 @@ static void shrink_liability(struct ubifs_info *c, int nr_to_write) { down_read(&c->vfs_sb->s_umount); - writeback_inodes_sb(c->vfs_sb); + writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE); up_read(&c->vfs_sb->s_umount); } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index eef109a1a927..b09ba2dd8b62 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -870,6 +870,22 @@ void dbg_dump_lpt_info(struct ubifs_info *c) spin_unlock(&dbg_lock); } +void dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) +{ + struct ubifs_scan_node *snod; + + printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n", + current->pid, sleb->lnum, offs); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum, + snod->offs, snod->len); + dbg_dump_node(c, snod->node); + } +} + void dbg_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 45174b534377..8d9c46810189 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -269,6 +269,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); void dbg_dump_lprops(struct ubifs_info *c); void dbg_dump_lpt_info(struct ubifs_info *c); void dbg_dump_leb(const struct ubifs_info *c, int lnum); +void dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs); void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode); void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); @@ -335,9 +337,9 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); #define DBGKEY(key) ((char *)(key)) #define DBGKEY1(key) ((char *)(key)) -#define ubifs_dbg_msg(fmt, ...) do { \ - if (0) \ - pr_debug(fmt "\n", ##__VA_ARGS__); \ +#define ubifs_dbg_msg(fmt, ...) do { \ + if (0) \ + printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__); \ } while (0) #define dbg_dump_stack() @@ -387,6 +389,9 @@ static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; } static inline void dbg_dump_leb(const struct ubifs_info *c, int lnum) { return; } static inline void +dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) { return; } +static inline void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode) { return; } static inline void dbg_dump_heap(struct ubifs_info *c, diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index af02790d9328..ee4f43f4bb99 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -983,7 +983,7 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf) } /** - * clean_an_unclean_leb - read and write a LEB to remove corruption. + * clean_an_unclean_leb - read and write a LEB to remove corruption. * @c: UBIFS file-system description object * @ucleb: unclean LEB information * @sbuf: LEB-sized buffer to use diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 93d938ad3d2a..6094c5a5d7a8 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c) mst->total_dirty = cpu_to_le64(tmp64); /* The indexing LEB does not contribute to dark space */ - tmp64 = (c->main_lebs - 1) * c->dark_wm; + tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm); mst->total_dark = cpu_to_le64(tmp64); mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b28121278d46..20403dc5d437 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -129,7 +129,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) goto out_ino; inode->i_flags |= (S_NOCMTIME | S_NOATIME); - inode->i_nlink = le32_to_cpu(ino->nlink); + set_nlink(inode, le32_to_cpu(ino->nlink)); inode->i_uid = le32_to_cpu(ino->uid); inode->i_gid = le32_to_cpu(ino->gid); inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 16f19f55e63f..bf18f7a04544 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -558,10 +558,10 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) } ubifs_assert(inode->i_nlink == 1); - inode->i_nlink = 0; + clear_nlink(inode); err = remove_xattr(c, host, inode, &nm); if (err) - inode->i_nlink = 1; + set_nlink(inode, 1); /* If @i_nlink is 0, 'iput()' will delete the inode */ iput(inode); diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 95518a9f589e..987585bb0a1d 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -59,8 +59,8 @@ static int __load_block_bitmap(struct super_block *sb, int nr_groups = bitmap->s_nr_groups; if (block_group >= nr_groups) { - udf_debug("block_group (%d) > nr_groups (%d)\n", block_group, - nr_groups); + udf_debug("block_group (%d) > nr_groups (%d)\n", + block_group, nr_groups); } if (bitmap->s_block_bitmap[block_group]) { @@ -126,8 +126,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb, if (bloc->logicalBlockNum + count < count || (bloc->logicalBlockNum + count) > partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", - bloc->logicalBlockNum, 0, bloc->logicalBlockNum, - count, partmap->s_partition_len); + bloc->logicalBlockNum, 0, + bloc->logicalBlockNum, count, + partmap->s_partition_len); goto error_return; } @@ -155,7 +156,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb, if (udf_set_bit(bit + i, bh->b_data)) { udf_debug("bit %ld already set\n", bit + i); udf_debug("byte=%2x\n", - ((char *)bh->b_data)[(bit + i) >> 3]); + ((char *)bh->b_data)[(bit + i) >> 3]); } } udf_add_free_space(sb, sbi->s_partition, count); @@ -369,7 +370,8 @@ static void udf_table_free_blocks(struct super_block *sb, if (bloc->logicalBlockNum + count < count || (bloc->logicalBlockNum + count) > partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", - bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, + bloc->logicalBlockNum, 0, + bloc->logicalBlockNum, count, partmap->s_partition_len); goto error_return; } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 2ffdb6733af1..3e44f575fb9c 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -162,8 +162,8 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) int padlen; if ((!buffer) || (!offset)) { - udf_debug("invalidparms\n, buffer=%p, offset=%p\n", buffer, - offset); + udf_debug("invalidparms, buffer=%p, offset=%p\n", + buffer, offset); return NULL; } @@ -201,7 +201,7 @@ struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offs struct short_ad *sa; if ((!ptr) || (!offset)) { - printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n"); + pr_err("%s: invalidparms\n", __func__); return NULL; } @@ -223,7 +223,7 @@ struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset struct long_ad *la; if ((!ptr) || (!offset)) { - printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n"); + pr_err("%s: invalidparms\n", __func__); return NULL; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1d1358ed80c1..4fd1d809738c 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -37,6 +37,7 @@ #include <linux/writeback.h> #include <linux/slab.h> #include <linux/crc-itu-t.h> +#include <linux/mpage.h> #include "udf_i.h" #include "udf_sb.h" @@ -83,12 +84,10 @@ void udf_evict_inode(struct inode *inode) end_writeback(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { - printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " - "inode size %llu different from extent length %llu. " - "Filesystem need not be standards compliant.\n", - inode->i_sb->s_id, inode->i_ino, inode->i_mode, - (unsigned long long)inode->i_size, - (unsigned long long)iinfo->i_lenExtents); + udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", + inode->i_ino, inode->i_mode, + (unsigned long long)inode->i_size, + (unsigned long long)iinfo->i_lenExtents); } kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; @@ -104,7 +103,13 @@ static int udf_writepage(struct page *page, struct writeback_control *wbc) static int udf_readpage(struct file *file, struct page *page) { - return block_read_full_page(page, udf_get_block); + return mpage_readpage(page, udf_get_block); +} + +static int udf_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, udf_get_block); } static int udf_write_begin(struct file *file, struct address_space *mapping, @@ -139,6 +144,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block) const struct address_space_operations udf_aops = { .readpage = udf_readpage, + .readpages = udf_readpages, .writepage = udf_writepage, .write_begin = udf_write_begin, .write_end = generic_write_end, @@ -1169,16 +1175,15 @@ static void __udf_read_inode(struct inode *inode) */ bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); if (!bh) { - printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n", - inode->i_ino); + udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino); make_bad_inode(inode); return; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { - printk(KERN_ERR "udf: udf_read_inode(ino %ld) " - "failed ident=%d\n", inode->i_ino, ident); + udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n", + inode->i_ino, ident); brelse(bh); make_bad_inode(inode); return; @@ -1218,8 +1223,8 @@ static void __udf_read_inode(struct inode *inode) } brelse(ibh); } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { - printk(KERN_ERR "udf: unsupported strategy type: %d\n", - le16_to_cpu(fe->icbTag.strategyType)); + udf_err(inode->i_sb, "unsupported strategy type: %d\n", + le16_to_cpu(fe->icbTag.strategyType)); brelse(bh); make_bad_inode(inode); return; @@ -1236,6 +1241,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) int offset; struct udf_sb_info *sbi = UDF_SB(inode->i_sb); struct udf_inode_info *iinfo = UDF_I(inode); + unsigned int link_count; fe = (struct fileEntry *)bh->b_data; efe = (struct extendedFileEntry *)bh->b_data; @@ -1318,9 +1324,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) inode->i_mode &= ~sbi->s_umask; read_unlock(&sbi->s_cred_lock); - inode->i_nlink = le16_to_cpu(fe->fileLinkCount); - if (!inode->i_nlink) - inode->i_nlink = 1; + link_count = le16_to_cpu(fe->fileLinkCount); + if (!link_count) + link_count = 1; + set_nlink(inode, link_count); inode->i_size = le64_to_cpu(fe->informationLength); iinfo->i_lenExtents = inode->i_size; @@ -1413,9 +1420,8 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) udf_debug("METADATA BITMAP FILE-----\n"); break; default: - printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown " - "file type=%d\n", inode->i_ino, - fe->icbTag.fileType); + udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n", + inode->i_ino, fe->icbTag.fileType); make_bad_inode(inode); return; } @@ -1438,8 +1444,8 @@ static int udf_alloc_i_data(struct inode *inode, size_t size) iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); if (!iinfo->i_ext.i_data) { - printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) " - "no free memory\n", inode->i_ino); + udf_err(inode->i_sb, "(ino %ld) no free memory\n", + inode->i_ino); return -ENOMEM; } @@ -1689,9 +1695,8 @@ out: if (do_sync) { sync_dirty_buffer(bh); if (buffer_write_io_error(bh)) { - printk(KERN_WARNING "IO error syncing udf inode " - "[%s:%08lx]\n", inode->i_sb->s_id, - inode->i_ino); + udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n", + inode->i_ino); err = -EIO; } } @@ -1982,8 +1987,7 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; break; default: - udf_debug("alloc_type = %d unsupported\n", - iinfo->i_alloc_type); + udf_debug("alloc_type = %d unsupported\n", iinfo->i_alloc_type); return -1; } diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 43e24a3b8e10..6583fe9b0645 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -38,7 +38,7 @@ unsigned int udf_get_last_session(struct super_block *sb) if (i == 0) { udf_debug("XA disk: %s, vol_desc_start=%d\n", - (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); + ms_info.xa_flag ? "yes" : "no", ms_info.addr.lba); if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ vol_desc_start = ms_info.addr.lba; } else { diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 9215700c00a4..c175b4dabc14 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -204,6 +204,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, { struct tag *tag_p; struct buffer_head *bh = NULL; + u8 checksum; /* Read the block */ if (block == 0xFFFFFFFF) @@ -211,8 +212,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, bh = udf_tread(sb, block); if (!bh) { - udf_debug("block=%d, location=%d: read failed\n", - block, location); + udf_err(sb, "read failed, block=%u, location=%d\n", + block, location); return NULL; } @@ -227,16 +228,18 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, } /* Verify the tag checksum */ - if (udf_tag_checksum(tag_p) != tag_p->tagChecksum) { - printk(KERN_ERR "udf: tag checksum failed block %d\n", block); + checksum = udf_tag_checksum(tag_p); + if (checksum != tag_p->tagChecksum) { + udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n", + block, checksum, tag_p->tagChecksum); goto error_out; } /* Verify the tag version */ if (tag_p->descVersion != cpu_to_le16(0x0002U) && tag_p->descVersion != cpu_to_le16(0x0003U)) { - udf_debug("tag version 0x%04x != 0x0002 || 0x0003 block %d\n", - le16_to_cpu(tag_p->descVersion), block); + udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n", + le16_to_cpu(tag_p->descVersion), block); goto error_out; } @@ -248,8 +251,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, return bh; udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block, - le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength)); - + le16_to_cpu(tag_p->descCRC), + le16_to_cpu(tag_p->descCRCLength)); error_out: brelse(bh); return NULL; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index f1dce848ef96..4639e137222f 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -577,8 +577,7 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink--; - mark_inode_dirty(inode); + inode_dec_link_count(inode); iput(inode); return err; } @@ -618,8 +617,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, init_special_inode(inode, mode, rdev); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink--; - mark_inode_dirty(inode); + inode_dec_link_count(inode); iput(inode); return err; } @@ -665,12 +663,11 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) inode->i_fop = &udf_dir_operations; fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink--; - mark_inode_dirty(inode); + inode_dec_link_count(inode); iput(inode); goto out; } - inode->i_nlink = 2; + set_nlink(inode, 2); cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location); *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = @@ -683,7 +680,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); goto out; @@ -799,9 +796,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) if (retval) goto end_rmdir; if (inode->i_nlink != 2) - udf_warning(inode->i_sb, "udf_rmdir", - "empty directory has nlink != 2 (%d)", - inode->i_nlink); + udf_warn(inode->i_sb, "empty directory has nlink != 2 (%d)\n", + inode->i_nlink); clear_nlink(inode); inode->i_size = 0; inode_dec_link_count(dir); @@ -840,7 +836,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) if (!inode->i_nlink) { udf_debug("Deleting nonexistent file (%lu), %d\n", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = udf_delete_entry(dir, fi, &fibh, &cfi); if (retval) diff --git a/fs/udf/partition.c b/fs/udf/partition.c index a71090ea0e07..d6caf01a2097 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -33,8 +33,8 @@ uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; if (partition >= sbi->s_partitions) { - udf_debug("block=%d, partition=%d, offset=%d: " - "invalid partition\n", block, partition, offset); + udf_debug("block=%d, partition=%d, offset=%d: invalid partition\n", + block, partition, offset); return 0xFFFFFFFF; } map = &sbi->s_partmaps[partition]; @@ -60,8 +60,8 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, vdata = &map->s_type_specific.s_virtual; if (block > vdata->s_num_entries) { - udf_debug("Trying to access block beyond end of VAT " - "(%d max %d)\n", block, vdata->s_num_entries); + udf_debug("Trying to access block beyond end of VAT (%d max %d)\n", + block, vdata->s_num_entries); return 0xFFFFFFFF; } @@ -321,9 +321,14 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block, /* We shouldn't mount such media... */ BUG_ON(!inode); retblk = udf_try_read_meta(inode, block, partition, offset); - if (retblk == 0xFFFFFFFF) { - udf_warning(sb, __func__, "error reading from METADATA, " - "trying to read from MIRROR"); + if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) { + udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n"); + if (!(mdata->s_flags & MF_MIRROR_FE_LOADED)) { + mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, + mdata->s_mirror_file_loc, map->s_partition_num); + mdata->s_flags |= MF_MIRROR_FE_LOADED; + } + inode = mdata->s_mirror_fe; if (!inode) return 0xFFFFFFFF; diff --git a/fs/udf/super.c b/fs/udf/super.c index 7b27b063ff6d..e185253470df 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -75,8 +75,6 @@ #define UDF_DEFAULT_BLOCKSIZE 2048 -static char error_buf[1024]; - /* These are the "meat" - everything else is stuffing */ static int udf_fill_super(struct super_block *, void *, int); static void udf_put_super(struct super_block *); @@ -92,8 +90,6 @@ static void udf_close_lvid(struct super_block *); static unsigned int udf_count_free(struct super_block *); static int udf_statfs(struct dentry *, struct kstatfs *); static int udf_show_options(struct seq_file *, struct vfsmount *); -static void udf_error(struct super_block *sb, const char *function, - const char *fmt, ...); struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) { @@ -244,9 +240,8 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map), GFP_KERNEL); if (!sbi->s_partmaps) { - udf_error(sb, __func__, - "Unable to allocate space for %d partition maps", - count); + udf_err(sb, "Unable to allocate space for %d partition maps\n", + count); sbi->s_partitions = 0; return -ENOMEM; } @@ -550,8 +545,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt, uopt->dmode = option & 0777; break; default: - printk(KERN_ERR "udf: bad mount option \"%s\" " - "or missing value\n", p); + pr_err("bad mount option \"%s\" or missing value\n", p); return 0; } } @@ -645,20 +639,16 @@ static loff_t udf_check_vsd(struct super_block *sb) udf_debug("ISO9660 Boot Record found\n"); break; case 1: - udf_debug("ISO9660 Primary Volume Descriptor " - "found\n"); + udf_debug("ISO9660 Primary Volume Descriptor found\n"); break; case 2: - udf_debug("ISO9660 Supplementary Volume " - "Descriptor found\n"); + udf_debug("ISO9660 Supplementary Volume Descriptor found\n"); break; case 3: - udf_debug("ISO9660 Volume Partition Descriptor " - "found\n"); + udf_debug("ISO9660 Volume Partition Descriptor found\n"); break; case 255: - udf_debug("ISO9660 Volume Descriptor Set " - "Terminator found\n"); + udf_debug("ISO9660 Volume Descriptor Set Terminator found\n"); break; default: udf_debug("ISO9660 VRS (%u) found\n", @@ -809,8 +799,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) pvoldesc->recordingDateAndTime)) { #ifdef UDFFS_DEBUG struct timestamp *ts = &pvoldesc->recordingDateAndTime; - udf_debug("recording time %04u/%02u/%02u" - " %02u:%02u (%x)\n", + udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n", le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, ts->minute, le16_to_cpu(ts->typeAndTimezone)); #endif @@ -821,7 +810,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, outstr->u_len > 31 ? 31 : outstr->u_len); udf_debug("volIdent[] = '%s'\n", - UDF_SB(sb)->s_volume_ident); + UDF_SB(sb)->s_volume_ident); } if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) @@ -837,64 +826,57 @@ out1: return ret; } +struct inode *udf_find_metadata_inode_efe(struct super_block *sb, + u32 meta_file_loc, u32 partition_num) +{ + struct kernel_lb_addr addr; + struct inode *metadata_fe; + + addr.logicalBlockNum = meta_file_loc; + addr.partitionReferenceNum = partition_num; + + metadata_fe = udf_iget(sb, &addr); + + if (metadata_fe == NULL) + udf_warn(sb, "metadata inode efe not found\n"); + else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { + udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); + iput(metadata_fe); + metadata_fe = NULL; + } + + return metadata_fe; +} + static int udf_load_metadata_files(struct super_block *sb, int partition) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; struct udf_meta_data *mdata; struct kernel_lb_addr addr; - int fe_error = 0; map = &sbi->s_partmaps[partition]; mdata = &map->s_type_specific.s_metadata; /* metadata address */ - addr.logicalBlockNum = mdata->s_meta_file_loc; - addr.partitionReferenceNum = map->s_partition_num; - udf_debug("Metadata file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + mdata->s_meta_file_loc, map->s_partition_num); - mdata->s_metadata_fe = udf_iget(sb, &addr); + mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb, + mdata->s_meta_file_loc, map->s_partition_num); if (mdata->s_metadata_fe == NULL) { - udf_warning(sb, __func__, "metadata inode efe not found, " - "will try mirror inode."); - fe_error = 1; - } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type != - ICBTAG_FLAG_AD_SHORT) { - udf_warning(sb, __func__, "metadata inode efe does not have " - "short allocation descriptors!"); - fe_error = 1; - iput(mdata->s_metadata_fe); - mdata->s_metadata_fe = NULL; - } + /* mirror file entry */ + udf_debug("Mirror metadata file location: block = %d part = %d\n", + mdata->s_mirror_file_loc, map->s_partition_num); - /* mirror file entry */ - addr.logicalBlockNum = mdata->s_mirror_file_loc; - addr.partitionReferenceNum = map->s_partition_num; - - udf_debug("Mirror metadata file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, + mdata->s_mirror_file_loc, map->s_partition_num); - mdata->s_mirror_fe = udf_iget(sb, &addr); - - if (mdata->s_mirror_fe == NULL) { - if (fe_error) { - udf_error(sb, __func__, "mirror inode efe not found " - "and metadata inode is missing too, exiting..."); - goto error_exit; - } else - udf_warning(sb, __func__, "mirror inode efe not found," - " but metadata inode is OK"); - } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type != - ICBTAG_FLAG_AD_SHORT) { - udf_warning(sb, __func__, "mirror inode efe does not have " - "short allocation descriptors!"); - iput(mdata->s_mirror_fe); - mdata->s_mirror_fe = NULL; - if (fe_error) + if (mdata->s_mirror_fe == NULL) { + udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); goto error_exit; + } } /* @@ -907,18 +889,15 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) addr.partitionReferenceNum = map->s_partition_num; udf_debug("Bitmap file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + addr.logicalBlockNum, addr.partitionReferenceNum); mdata->s_bitmap_fe = udf_iget(sb, &addr); if (mdata->s_bitmap_fe == NULL) { if (sb->s_flags & MS_RDONLY) - udf_warning(sb, __func__, "bitmap inode efe " - "not found but it's ok since the disc" - " is mounted read-only"); + udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); else { - udf_error(sb, __func__, "bitmap inode efe not " - "found and attempted read-write mount"); + udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); goto error_exit; } } @@ -971,9 +950,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ if (bitmap == NULL) { - udf_error(sb, __func__, - "Unable to allocate space for bitmap " - "and %d buffer_head pointers", nr_groups); + udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n", + nr_groups); return NULL; } @@ -1003,10 +981,9 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE)) map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE; - udf_debug("Partition (%d type %x) starts at physical %d, " - "block length %d\n", p_index, - map->s_partition_type, map->s_partition_root, - map->s_partition_len); + udf_debug("Partition (%d type %x) starts at physical %d, block length %d\n", + p_index, map->s_partition_type, + map->s_partition_root, map->s_partition_len); if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) @@ -1023,12 +1000,12 @@ static int udf_fill_partdesc_info(struct super_block *sb, map->s_uspace.s_table = udf_iget(sb, &loc); if (!map->s_uspace.s_table) { udf_debug("cannot load unallocSpaceTable (part %d)\n", - p_index); + p_index); return 1; } map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; udf_debug("unallocSpaceTable (part %d) @ %ld\n", - p_index, map->s_uspace.s_table->i_ino); + p_index, map->s_uspace.s_table->i_ino); } if (phd->unallocSpaceBitmap.extLength) { @@ -1041,8 +1018,8 @@ static int udf_fill_partdesc_info(struct super_block *sb, bitmap->s_extPosition = le32_to_cpu( phd->unallocSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; - udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index, - bitmap->s_extPosition); + udf_debug("unallocSpaceBitmap (part %d) @ %d\n", + p_index, bitmap->s_extPosition); } if (phd->partitionIntegrityTable.extLength) @@ -1058,13 +1035,13 @@ static int udf_fill_partdesc_info(struct super_block *sb, map->s_fspace.s_table = udf_iget(sb, &loc); if (!map->s_fspace.s_table) { udf_debug("cannot load freedSpaceTable (part %d)\n", - p_index); + p_index); return 1; } map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; udf_debug("freedSpaceTable (part %d) @ %ld\n", - p_index, map->s_fspace.s_table->i_ino); + p_index, map->s_fspace.s_table->i_ino); } if (phd->freedSpaceBitmap.extLength) { @@ -1077,8 +1054,8 @@ static int udf_fill_partdesc_info(struct super_block *sb, bitmap->s_extPosition = le32_to_cpu( phd->freedSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; - udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index, - bitmap->s_extPosition); + udf_debug("freedSpaceBitmap (part %d) @ %d\n", + p_index, bitmap->s_extPosition); } return 0; } @@ -1118,11 +1095,9 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); if (!sbi->s_vat_inode && sbi->s_last_block != blocks - 1) { - printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" - " last recorded block (%lu), retrying with the last " - "block of the device (%lu).\n", - (unsigned long)sbi->s_last_block, - (unsigned long)blocks - 1); + pr_notice("Failed to read VAT inode from the last recorded block (%lu), retrying with the last block of the device (%lu).\n", + (unsigned long)sbi->s_last_block, + (unsigned long)blocks - 1); udf_find_vat_block(sb, p_index, type1_index, blocks - 1); } if (!sbi->s_vat_inode) @@ -1220,8 +1195,8 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) if (map->s_partition_type == UDF_METADATA_MAP25) { ret = udf_load_metadata_files(sb, i); if (ret) { - printk(KERN_ERR "UDF-fs: error loading MetaData " - "partition map %d\n", i); + udf_err(sb, "error loading MetaData partition map %d\n", + i); goto out_bh; } } else { @@ -1234,9 +1209,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) * overwrite blocks instead of relocating them). */ sb->s_flags |= MS_RDONLY; - printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only " - "because writing to pseudooverwrite partition is " - "not implemented.\n"); + pr_notice("Filesystem marked read-only because writing to pseudooverwrite partition is not implemented\n"); } out_bh: /* In case loading failed, we handle cleanup in udf_fill_super */ @@ -1344,9 +1317,8 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, struct metadataPartitionMap *mdm = (struct metadataPartitionMap *) &(lvd->partitionMaps[offset]); - udf_debug("Parsing Logical vol part %d " - "type %d id=%s\n", i, type, - UDF_ID_METADATA); + udf_debug("Parsing Logical vol part %d type %d id=%s\n", + i, type, UDF_ID_METADATA); map->s_partition_type = UDF_METADATA_MAP25; map->s_partition_func = udf_get_pblock_meta25; @@ -1361,25 +1333,24 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, le32_to_cpu(mdm->allocUnitSize); mdata->s_align_unit_size = le16_to_cpu(mdm->alignUnitSize); - mdata->s_dup_md_flag = - mdm->flags & 0x01; + if (mdm->flags & 0x01) + mdata->s_flags |= MF_DUPLICATE_MD; udf_debug("Metadata Ident suffix=0x%x\n", - (le16_to_cpu( - ((__le16 *) - mdm->partIdent.identSuffix)[0]))); + le16_to_cpu(*(__le16 *) + mdm->partIdent.identSuffix)); udf_debug("Metadata part num=%d\n", - le16_to_cpu(mdm->partitionNum)); + le16_to_cpu(mdm->partitionNum)); udf_debug("Metadata part alloc unit size=%d\n", - le32_to_cpu(mdm->allocUnitSize)); + le32_to_cpu(mdm->allocUnitSize)); udf_debug("Metadata file loc=%d\n", - le32_to_cpu(mdm->metadataFileLoc)); + le32_to_cpu(mdm->metadataFileLoc)); udf_debug("Mirror file loc=%d\n", - le32_to_cpu(mdm->metadataMirrorFileLoc)); + le32_to_cpu(mdm->metadataMirrorFileLoc)); udf_debug("Bitmap file loc=%d\n", - le32_to_cpu(mdm->metadataBitmapFileLoc)); - udf_debug("Duplicate Flag: %d %d\n", - mdata->s_dup_md_flag, mdm->flags); + le32_to_cpu(mdm->metadataBitmapFileLoc)); + udf_debug("Flags: %d %d\n", + mdata->s_flags, mdm->flags); } else { udf_debug("Unknown ident: %s\n", upm2->partIdent.ident); @@ -1389,16 +1360,15 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, map->s_partition_num = le16_to_cpu(upm2->partitionNum); } udf_debug("Partition (%d:%d) type %d on volume %d\n", - i, map->s_partition_num, type, - map->s_volumeseqnum); + i, map->s_partition_num, type, map->s_volumeseqnum); } if (fileset) { struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]); *fileset = lelb_to_cpu(la->extLocation); - udf_debug("FileSet found in LogicalVolDesc at block=%d, " - "partition=%d\n", fileset->logicalBlockNum, + udf_debug("FileSet found in LogicalVolDesc at block=%d, partition=%d\n", + fileset->logicalBlockNum, fileset->partitionReferenceNum); } if (lvd->integritySeqExt.extLength) @@ -1478,9 +1448,9 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, bh = udf_read_tagged(sb, block, block, &ident); if (!bh) { - printk(KERN_ERR "udf: Block %Lu of volume descriptor " - "sequence is corrupted or we could not read " - "it.\n", (unsigned long long)block); + udf_err(sb, + "Block %llu of volume descriptor sequence is corrupted or we could not read it\n", + (unsigned long long)block); return 1; } @@ -1553,7 +1523,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, * in a suitable order */ if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { - printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n"); + udf_err(sb, "Primary Volume Descriptor not found!\n"); return 1; } if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block)) @@ -1740,7 +1710,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, if (!sb_set_blocksize(sb, uopt->blocksize)) { if (!silent) - printk(KERN_WARNING "UDF-fs: Bad block size\n"); + udf_warn(sb, "Bad block size\n"); return 0; } sbi->s_last_block = uopt->lastblock; @@ -1749,12 +1719,11 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, nsr_off = udf_check_vsd(sb); if (!nsr_off) { if (!silent) - printk(KERN_WARNING "UDF-fs: No VRS found\n"); + udf_warn(sb, "No VRS found\n"); return 0; } if (nsr_off == -1) - udf_debug("Failed to read byte 32768. Assuming open " - "disc. Skipping validity check\n"); + udf_debug("Failed to read byte 32768. Assuming open disc. Skipping validity check\n"); if (!sbi->s_last_block) sbi->s_last_block = udf_get_last_block(sb); } else { @@ -1765,7 +1734,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, sbi->s_anchor = uopt->anchor; if (!udf_find_anchor(sb, fileset)) { if (!silent) - printk(KERN_WARNING "UDF-fs: No anchor found\n"); + udf_warn(sb, "No anchor found\n"); return 0; } return 1; @@ -1937,8 +1906,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (uopt.flags & (1 << UDF_FLAG_UTF8) && uopt.flags & (1 << UDF_FLAG_NLS_MAP)) { - udf_error(sb, "udf_read_super", - "utf8 cannot be combined with iocharset\n"); + udf_err(sb, "utf8 cannot be combined with iocharset\n"); goto error_out; } #ifdef CONFIG_UDF_NLS @@ -1987,15 +1955,14 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) ret = udf_load_vrs(sb, &uopt, silent, &fileset); if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { if (!silent) - printk(KERN_NOTICE - "UDF-fs: Rescanning with blocksize " - "%d\n", UDF_DEFAULT_BLOCKSIZE); + pr_notice("Rescanning with blocksize %d\n", + UDF_DEFAULT_BLOCKSIZE); uopt.blocksize = UDF_DEFAULT_BLOCKSIZE; ret = udf_load_vrs(sb, &uopt, silent, &fileset); } } if (!ret) { - printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); + udf_warn(sb, "No partition found (1)\n"); goto error_out; } @@ -2010,10 +1977,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) le16_to_cpu(lvidiu->maxUDFWriteRev); */ if (minUDFReadRev > UDF_MAX_READ_VERSION) { - printk(KERN_ERR "UDF-fs: minUDFReadRev=%x " - "(max is %x)\n", - le16_to_cpu(lvidiu->minUDFReadRev), - UDF_MAX_READ_VERSION); + udf_err(sb, "minUDFReadRev=%x (max is %x)\n", + le16_to_cpu(lvidiu->minUDFReadRev), + UDF_MAX_READ_VERSION); goto error_out; } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) sb->s_flags |= MS_RDONLY; @@ -2027,28 +1993,27 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) } if (!sbi->s_partitions) { - printk(KERN_WARNING "UDF-fs: No partition found (2)\n"); + udf_warn(sb, "No partition found (2)\n"); goto error_out; } if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & UDF_PART_FLAG_READ_ONLY) { - printk(KERN_NOTICE "UDF-fs: Partition marked readonly; " - "forcing readonly mount\n"); + pr_notice("Partition marked readonly; forcing readonly mount\n"); sb->s_flags |= MS_RDONLY; } if (udf_find_fileset(sb, &fileset, &rootdir)) { - printk(KERN_WARNING "UDF-fs: No fileset found\n"); + udf_warn(sb, "No fileset found\n"); goto error_out; } if (!silent) { struct timestamp ts; udf_time_to_disk_stamp(&ts, sbi->s_record_time); - udf_info("UDF: Mounting volume '%s', " - "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", - sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day, + udf_info("Mounting volume '%s', timestamp %04u/%02u/%02u %02u:%02u (%x)\n", + sbi->s_volume_ident, + le16_to_cpu(ts.year), ts.month, ts.day, ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); } if (!(sb->s_flags & MS_RDONLY)) @@ -2059,8 +2024,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* perhaps it's not extensible enough, but for now ... */ inode = udf_iget(sb, &rootdir); if (!inode) { - printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " - "partition=%d\n", + udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", rootdir.logicalBlockNum, rootdir.partitionReferenceNum); goto error_out; } @@ -2068,7 +2032,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* Allocate a dentry for the root inode */ sb->s_root = d_alloc_root(inode); if (!sb->s_root) { - printk(KERN_ERR "UDF-fs: Couldn't allocate root dentry\n"); + udf_err(sb, "Couldn't allocate root dentry\n"); iput(inode); goto error_out; } @@ -2096,32 +2060,40 @@ error_out: return -EINVAL; } -static void udf_error(struct super_block *sb, const char *function, - const char *fmt, ...) +void _udf_err(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; - if (!(sb->s_flags & MS_RDONLY)) { - /* mark sb error */ + /* mark sb error */ + if (!(sb->s_flags & MS_RDONLY)) sb->s_dirt = 1; - } + va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("error (device %s): %s: %pV", sb->s_id, function, &vaf); + va_end(args); - printk(KERN_CRIT "UDF-fs error (device %s): %s: %s\n", - sb->s_id, function, error_buf); } -void udf_warning(struct super_block *sb, const char *function, - const char *fmt, ...) +void _udf_warn(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_warn("warning (device %s): %s: %pV", sb->s_id, function, &vaf); + va_end(args); - printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s\n", - sb->s_id, function, error_buf); } static void udf_put_super(struct super_block *sb) @@ -2213,11 +2185,11 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, bh = udf_read_ptagged(sb, &loc, 0, &ident); if (!bh) { - printk(KERN_ERR "udf: udf_count_free failed\n"); + udf_err(sb, "udf_count_free failed\n"); goto out; } else if (ident != TAG_IDENT_SBD) { brelse(bh); - printk(KERN_ERR "udf: udf_count_free failed\n"); + udf_err(sb, "udf_count_free failed\n"); goto out; } diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 8424308db4b4..4b98fee8e161 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -95,23 +95,21 @@ void udf_truncate_tail_extent(struct inode *inode) lbcount += elen; if (lbcount > inode->i_size) { if (lbcount - inode->i_size >= inode->i_sb->s_blocksize) - printk(KERN_WARNING - "udf_truncate_tail_extent(): Too long " - "extent after EOF in inode %u: i_size: " - "%Ld lbcount: %Ld extent %u+%u\n", - (unsigned)inode->i_ino, - (long long)inode->i_size, - (long long)lbcount, - (unsigned)eloc.logicalBlockNum, - (unsigned)elen); + udf_warn(inode->i_sb, + "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld extent %u+%u\n", + (unsigned)inode->i_ino, + (long long)inode->i_size, + (long long)lbcount, + (unsigned)eloc.logicalBlockNum, + (unsigned)elen); nelen = elen - (lbcount - inode->i_size); epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, nelen); epos.offset += adsize; if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) - printk(KERN_ERR "udf_truncate_tail_extent(): " - "Extent after EOF in inode %u.\n", - (unsigned)inode->i_ino); + udf_err(inode->i_sb, + "Extent after EOF in inode %u\n", + (unsigned)inode->i_ino); break; } } diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 4858c191242b..5142a82e3276 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -54,13 +54,16 @@ #pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ +#define MF_DUPLICATE_MD 0x01 +#define MF_MIRROR_FE_LOADED 0x02 + struct udf_meta_data { __u32 s_meta_file_loc; __u32 s_mirror_file_loc; __u32 s_bitmap_file_loc; __u32 s_alloc_unit_size; __u16 s_align_unit_size; - __u8 s_dup_md_flag; + int s_flags; struct inode *s_metadata_fe; struct inode *s_mirror_fe; struct inode *s_bitmap_fe; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index dbd52d4b5eed..f34e6fc0cdaa 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -1,6 +1,8 @@ #ifndef __UDF_DECL_H #define __UDF_DECL_H +#define pr_fmt(fmt) "UDF-fs: " fmt + #include "ecma_167.h" #include "osta_udf.h" @@ -16,23 +18,30 @@ #define UDF_PREALLOCATE #define UDF_DEFAULT_PREALLOC_BLOCKS 8 +extern __printf(3, 4) void _udf_err(struct super_block *sb, + const char *function, const char *fmt, ...); +#define udf_err(sb, fmt, ...) \ + _udf_err(sb, __func__, fmt, ##__VA_ARGS__) + +extern __printf(3, 4) void _udf_warn(struct super_block *sb, + const char *function, const char *fmt, ...); +#define udf_warn(sb, fmt, ...) \ + _udf_warn(sb, __func__, fmt, ##__VA_ARGS__) + +#define udf_info(fmt, ...) \ + pr_info("INFO " fmt, ##__VA_ARGS__) + #undef UDFFS_DEBUG #ifdef UDFFS_DEBUG -#define udf_debug(f, a...) \ -do { \ - printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \ - __FILE__, __LINE__, __func__); \ - printk(f, ##a); \ -} while (0) +#define udf_debug(fmt, ...) \ + printk(KERN_DEBUG pr_fmt("%s:%d:%s: " fmt), \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__) #else -#define udf_debug(f, a...) /**/ +#define udf_debug(fmt, ...) \ + no_printk(fmt, ##__VA_ARGS__) #endif -#define udf_info(f, a...) \ - printk(KERN_INFO "UDF-fs INFO " f, ##a); - - #define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) #define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) @@ -112,8 +121,6 @@ struct extent_position { /* super.c */ -__attribute__((format(printf, 3, 4))) -extern void udf_warning(struct super_block *, const char *, const char *, ...); static inline void udf_updated_lvid(struct super_block *sb) { struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; @@ -126,6 +133,8 @@ static inline void udf_updated_lvid(struct super_block *sb) UDF_SB(sb)->s_lvid_dirty = 1; } extern u64 lvid_get_unique_id(struct super_block *sb); +struct inode *udf_find_metadata_inode_efe(struct super_block *sb, + u32 meta_file_loc, u32 partition_num); /* namei.c */ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c index b8c828c4d200..1f11483eba6a 100644 --- a/fs/udf/udftime.c +++ b/fs/udf/udftime.c @@ -34,9 +34,10 @@ * http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm */ +#include "udfdecl.h" + #include <linux/types.h> #include <linux/kernel.h> -#include "udfdecl.h" #define EPOCH_YEAR 1970 diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index d03a90b6ad69..44b815e57f94 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -114,7 +114,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) cmp_id = ocu_i->u_cmpID; if (cmp_id != 8 && cmp_id != 16) { memset(utf_o, 0, sizeof(struct ustr)); - printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", + pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); return 0; } @@ -242,7 +242,7 @@ try_again: if (utf_cnt) { error_out: ocu[++u_len] = '?'; - printk(KERN_DEBUG "udf: bad UTF-8 character\n"); + printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n")); } ocu[length - 1] = (uint8_t)u_len + 1; @@ -267,7 +267,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, cmp_id = ocu_i->u_cmpID; if (cmp_id != 8 && cmp_id != 16) { memset(utf_o, 0, sizeof(struct ustr)); - printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", + pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); return 0; } diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 2eabf04af3de..78a4c70d46b5 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -341,7 +341,7 @@ cg_found: fail_remove_inode: unlock_super(sb); - inode->i_nlink = 0; + clear_nlink(inode); iput(inode); UFSD("EXIT (FAILED): err %d\n", err); return ERR_PTR(err); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index b4d791a83207..879b13436fa4 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -589,7 +589,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) * Copy data to the in-core inode. */ inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); - inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink); + set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink)); if (inode->i_nlink == 0) { ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); return -1; @@ -637,7 +637,7 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) * Copy data to the in-core inode. */ inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); - inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink); + set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink)); if (inode->i_nlink == 0) { ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); return -1; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 5be2755dd715..c26f2bcec264 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -117,9 +117,12 @@ extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buf extern const struct file_operations ufs_dir_operations; /* super.c */ -extern void ufs_warning (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ufs_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ufs_panic (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ufs_warning(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ufs_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ufs_panic(struct super_block *, const char *, const char *, ...); /* symlink.c */ extern const struct inode_operations ufs_fast_symlink_inode_operations; diff --git a/fs/xattr.c b/fs/xattr.c index f060663ab70c..67583de8218c 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -14,6 +14,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/security.h> +#include <linux/evm.h> #include <linux/syscalls.h> #include <linux/module.h> #include <linux/fsnotify.h> @@ -166,6 +167,64 @@ out_noalloc: } EXPORT_SYMBOL_GPL(xattr_getsecurity); +/* + * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr + * + * Allocate memory, if not already allocated, or re-allocate correct size, + * before retrieving the extended attribute. + * + * Returns the result of alloc, if failed, or the getxattr operation. + */ +ssize_t +vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, + size_t xattr_size, gfp_t flags) +{ + struct inode *inode = dentry->d_inode; + char *value = *xattr_value; + int error; + + error = xattr_permission(inode, name, MAY_READ); + if (error) + return error; + + if (!inode->i_op->getxattr) + return -EOPNOTSUPP; + + error = inode->i_op->getxattr(dentry, name, NULL, 0); + if (error < 0) + return error; + + if (!value || (error > xattr_size)) { + value = krealloc(*xattr_value, error + 1, flags); + if (!value) + return -ENOMEM; + memset(value, 0, error + 1); + } + + error = inode->i_op->getxattr(dentry, name, value, error); + *xattr_value = value; + return error; +} + +/* Compare an extended attribute value with the given value */ +int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name, + const char *value, size_t size, gfp_t flags) +{ + char *xattr_value = NULL; + int rc; + + rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags); + if (rc < 0) + return rc; + + if ((rc != size) || (memcmp(xattr_value, value, rc) != 0)) + rc = -EINVAL; + else + rc = 0; + kfree(xattr_value); + return rc; +} + ssize_t vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { @@ -243,8 +302,10 @@ vfs_removexattr(struct dentry *dentry, const char *name) error = inode->i_op->removexattr(dentry, name); mutex_unlock(&inode->i_mutex); - if (!error) + if (!error) { fsnotify_xattr(dentry); + evm_inode_post_removexattr(dentry, name); + } return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 75bb316529dd..427a4e82a588 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -16,44 +16,53 @@ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA # -ccflags-y := -I$(src) -I$(src)/linux-2.6 -ccflags-$(CONFIG_XFS_DEBUG) += -g +ccflags-y += -I$(src) # needed for trace events -XFS_LINUX := linux-2.6 +ccflags-$(CONFIG_XFS_DEBUG) += -g obj-$(CONFIG_XFS_FS) += xfs.o -xfs-y += linux-2.6/xfs_trace.o - -xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ - xfs_dquot.o \ - xfs_dquot_item.o \ - xfs_trans_dquot.o \ - xfs_qm_syscalls.o \ - xfs_qm_bhv.o \ - xfs_qm.o) -xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o - -ifeq ($(CONFIG_XFS_QUOTA),y) -xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o -endif - -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o -xfs-$(CONFIG_XFS_POSIX_ACL) += $(XFS_LINUX)/xfs_acl.o -xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o -xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o -xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o +# this one should be compiled first, as the tracing macros can easily blow up +xfs-y += xfs_trace.o +# highlevel code +xfs-y += xfs_aops.o \ + xfs_bit.o \ + xfs_buf.o \ + xfs_dfrag.o \ + xfs_discard.o \ + xfs_error.o \ + xfs_export.o \ + xfs_file.o \ + xfs_filestream.o \ + xfs_fsops.o \ + xfs_fs_subr.o \ + xfs_globals.o \ + xfs_iget.o \ + xfs_ioctl.o \ + xfs_iomap.o \ + xfs_iops.o \ + xfs_itable.o \ + xfs_message.o \ + xfs_mru_cache.o \ + xfs_super.o \ + xfs_sync.o \ + xfs_xattr.o \ + xfs_rename.o \ + xfs_rw.o \ + xfs_utils.o \ + xfs_vnodeops.o \ + kmem.o \ + uuid.o +# code shared with libxfs xfs-y += xfs_alloc.o \ xfs_alloc_btree.o \ xfs_attr.o \ xfs_attr_leaf.o \ - xfs_bit.o \ xfs_bmap.o \ xfs_bmap_btree.o \ xfs_btree.o \ - xfs_buf_item.o \ xfs_da_btree.o \ xfs_dir2.o \ xfs_dir2_block.o \ @@ -61,49 +70,37 @@ xfs-y += xfs_alloc.o \ xfs_dir2_leaf.o \ xfs_dir2_node.o \ xfs_dir2_sf.o \ - xfs_error.o \ - xfs_extfree_item.o \ - xfs_filestream.o \ - xfs_fsops.o \ xfs_ialloc.o \ xfs_ialloc_btree.o \ - xfs_iget.o \ xfs_inode.o \ - xfs_inode_item.o \ - xfs_iomap.o \ - xfs_itable.o \ - xfs_dfrag.o \ - xfs_log.o \ - xfs_log_cil.o \ xfs_log_recover.o \ xfs_mount.o \ - xfs_mru_cache.o \ - xfs_rename.o \ - xfs_trans.o \ + xfs_trans.o + +# low-level transaction/log code +xfs-y += xfs_log.o \ + xfs_log_cil.o \ + xfs_buf_item.o \ + xfs_extfree_item.o \ + xfs_inode_item.o \ xfs_trans_ail.o \ xfs_trans_buf.o \ xfs_trans_extfree.o \ xfs_trans_inode.o \ - xfs_utils.o \ - xfs_vnodeops.o \ - xfs_rw.o - -# Objects in linux/ -xfs-y += $(addprefix $(XFS_LINUX)/, \ - kmem.o \ - xfs_aops.o \ - xfs_buf.o \ - xfs_discard.o \ - xfs_export.o \ - xfs_file.o \ - xfs_fs_subr.o \ - xfs_globals.o \ - xfs_ioctl.o \ - xfs_iops.o \ - xfs_message.o \ - xfs_super.o \ - xfs_sync.o \ - xfs_xattr.o) -# Objects in support/ -xfs-y += support/uuid.o +# optional features +xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ + xfs_dquot_item.o \ + xfs_trans_dquot.o \ + xfs_qm_syscalls.o \ + xfs_qm_bhv.o \ + xfs_qm.o \ + xfs_quotaops.o +ifeq ($(CONFIG_XFS_QUOTA),y) +xfs-$(CONFIG_PROC_FS) += xfs_qm_stats.o +endif +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o +xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o +xfs-$(CONFIG_PROC_FS) += xfs_stats.o +xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o +xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/kmem.c index a907de565db3..a907de565db3 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/kmem.c diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/kmem.h index f7c8f7a9ea6d..292eff198030 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/kmem.h @@ -61,12 +61,7 @@ extern void kmem_free(const void *); static inline void *kmem_zalloc_large(size_t size) { - void *ptr; - - ptr = vmalloc(size); - if (ptr) - memset(ptr, 0, size); - return ptr; + return vzalloc(size); } static inline void kmem_free_large(void *ptr) { diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h deleted file mode 100644 index 7fb7ea007672..000000000000 --- a/fs/xfs/linux-2.6/xfs_message.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef __XFS_MESSAGE_H -#define __XFS_MESSAGE_H 1 - -struct xfs_mount; - -extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_alert_tag(const struct xfs_mount *mp, int tag, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); -extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); - -#ifdef DEBUG -extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -#else -static inline void -__attribute__ ((format (printf, 2, 3))) -xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) -{ -} -#endif - -extern void assfail(char *expr, char *f, int l); - -extern void xfs_hex_dump(void *p, int length); - -#endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/mrlock.h index ff6a19873e5c..ff6a19873e5c 100644 --- a/fs/xfs/linux-2.6/mrlock.h +++ b/fs/xfs/mrlock.h diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/time.h index 387e695a184c..387e695a184c 100644 --- a/fs/xfs/linux-2.6/time.h +++ b/fs/xfs/time.h diff --git a/fs/xfs/support/uuid.c b/fs/xfs/uuid.c index b83f76b6d410..b83f76b6d410 100644 --- a/fs/xfs/support/uuid.c +++ b/fs/xfs/uuid.c diff --git a/fs/xfs/support/uuid.h b/fs/xfs/uuid.h index 4732d71262cc..4732d71262cc 100644 --- a/fs/xfs/support/uuid.h +++ b/fs/xfs/uuid.h diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 53ec3ea9a625..d8b11b7f94aa 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -24,5 +24,6 @@ #define XFS_BUF_LOCK_TRACKING 1 #endif -#include <linux-2.6/xfs_linux.h> +#include "xfs_linux.h" + #endif /* __XFS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/xfs_acl.c index cac48fe22ad5..b6c4b3795c4a 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -114,6 +114,8 @@ xfs_get_acl(struct inode *inode, int type) if (acl != ACL_NOT_CACHED) return acl; + trace_xfs_get_acl(ip); + switch (type) { case ACL_TYPE_ACCESS: ea_name = SGI_ACL_FILE; @@ -218,42 +220,8 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -int -xfs_check_acl(struct inode *inode, int mask) -{ - struct xfs_inode *ip; - struct posix_acl *acl; - int error = -EAGAIN; - - ip = XFS_I(inode); - trace_xfs_check_acl(ip); - - /* - * If there is no attribute fork no ACL exists on this inode and - * we can skip the whole exercise. - */ - if (!XFS_IFORK_Q(ip)) - return -EAGAIN; - - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - return -ECHILD; - return -EAGAIN; - } - - acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - } - - return error; -} - static int -xfs_set_mode(struct inode *inode, mode_t mode) +xfs_set_mode(struct inode *inode, umode_t mode) { int error = 0; @@ -297,29 +265,23 @@ posix_acl_default_exists(struct inode *inode) * No need for i_mutex because the inode is not yet exposed to the VFS. */ int -xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl) +xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) { - struct posix_acl *clone; - mode_t mode; + umode_t mode = inode->i_mode; int error = 0, inherit = 0; if (S_ISDIR(inode->i_mode)) { - error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl); + error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, acl); if (error) - return error; + goto out; } - clone = posix_acl_clone(default_acl, GFP_KERNEL); - if (!clone) - return -ENOMEM; - - mode = inode->i_mode; - error = posix_acl_create_masq(clone, &mode); + error = posix_acl_create(&acl, GFP_KERNEL, &mode); if (error < 0) - goto out_release_clone; + return error; /* - * If posix_acl_create_masq returns a positive value we need to + * If posix_acl_create returns a positive value we need to * inherit a permission that can't be represented using the Unix * mode bits and we actually need to set an ACL. */ @@ -328,20 +290,20 @@ xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl) error = xfs_set_mode(inode, mode); if (error) - goto out_release_clone; + goto out; if (inherit) - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone); + error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); - out_release_clone: - posix_acl_release(clone); +out: + posix_acl_release(acl); return error; } int xfs_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; int error; if (S_ISLNK(inode->i_mode)) @@ -351,16 +313,12 @@ xfs_acl_chmod(struct inode *inode) if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone); + error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (error) + return error; - posix_acl_release(clone); + error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); + posix_acl_release(acl); return error; } @@ -423,7 +381,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, goto out_release; if (type == ACL_TYPE_ACCESS) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error <= 0) { diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 0135e2a669d7..39632d941354 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -42,7 +42,6 @@ struct xfs_acl { #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) #ifdef CONFIG_XFS_POSIX_ACL -extern int xfs_check_acl(struct inode *inode, int mask); extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); extern int xfs_acl_chmod(struct inode *inode); @@ -52,8 +51,10 @@ extern int posix_acl_default_exists(struct inode *inode); extern const struct xattr_handler xfs_xattr_acl_access_handler; extern const struct xattr_handler xfs_xattr_acl_default_handler; #else -# define xfs_check_acl NULL -# define xfs_get_acl(inode, type) NULL +static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) +{ + return NULL; +} # define xfs_inherit_acl(inode, default_acl) 0 # define xfs_acl_chmod(inode) 0 # define posix_acl_access_exists(inode) 0 diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 6530769a999b..4805f009f923 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -103,7 +103,7 @@ typedef struct xfs_agf { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) -#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); @@ -156,7 +156,7 @@ typedef struct xfs_agi { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) #define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) -#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); @@ -168,7 +168,7 @@ extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) #define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) -#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) typedef struct xfs_agfl { __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 1e00b3ef6274..ce84ffd0264c 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -451,9 +451,8 @@ xfs_alloc_read_agfl( XFS_FSS_TO_BB(mp, 1), 0, &bp); if (error) return error; - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); - XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); + ASSERT(!xfs_buf_geterror(bp)); + xfs_buf_set_ref(bp, XFS_AGFL_REF); *bpp = bp; return 0; } @@ -2116,7 +2115,7 @@ xfs_read_agf( if (!*bpp) return 0; - ASSERT(!XFS_BUF_GETERROR(*bpp)); + ASSERT(!(*bpp)->b_error); agf = XFS_BUF_TO_AGF(*bpp); /* @@ -2140,7 +2139,7 @@ xfs_read_agf( xfs_trans_brelse(tp, *bpp); return XFS_ERROR(EFSCORRUPTED); } - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); + xfs_buf_set_ref(*bpp, XFS_AGF_REF); return 0; } @@ -2168,7 +2167,7 @@ xfs_alloc_read_agf( return error; if (!*bpp) return 0; - ASSERT(!XFS_BUF_GETERROR(*bpp)); + ASSERT(!(*bpp)->b_error); agf = XFS_BUF_TO_AGF(*bpp); pag = xfs_perag_get(mp, agno); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/xfs_aops.c index 63e971e2b837..574d4ee9b625 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -38,40 +38,6 @@ #include <linux/pagevec.h> #include <linux/writeback.h> - -/* - * Prime number of hash buckets since address is used as the key. - */ -#define NVSYNC 37 -#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC]) -static wait_queue_head_t xfs_ioend_wq[NVSYNC]; - -void __init -xfs_ioend_init(void) -{ - int i; - - for (i = 0; i < NVSYNC; i++) - init_waitqueue_head(&xfs_ioend_wq[i]); -} - -void -xfs_ioend_wait( - xfs_inode_t *ip) -{ - wait_queue_head_t *wq = to_ioend_wq(ip); - - wait_event(*wq, (atomic_read(&ip->i_iocount) == 0)); -} - -STATIC void -xfs_ioend_wake( - xfs_inode_t *ip) -{ - if (atomic_dec_and_test(&ip->i_iocount)) - wake_up(to_ioend_wq(ip)); -} - void xfs_count_page_state( struct page *page, @@ -115,25 +81,20 @@ xfs_destroy_ioend( xfs_ioend_t *ioend) { struct buffer_head *bh, *next; - struct xfs_inode *ip = XFS_I(ioend->io_inode); for (bh = ioend->io_buffer_head; bh; bh = next) { next = bh->b_private; bh->b_end_io(bh, !ioend->io_error); } - /* - * Volume managers supporting multiple paths can send back ENODEV - * when the final path disappears. In this case continuing to fill - * the page cache with dirty data which cannot be written out is - * evil, so prevent that. - */ - if (unlikely(ioend->io_error == -ENODEV)) { - xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, - __FILE__, __LINE__); + if (ioend->io_iocb) { + if (ioend->io_isasync) { + aio_complete(ioend->io_iocb, ioend->io_error ? + ioend->io_error : ioend->io_result, 0); + } + inode_dio_done(ioend->io_inode); } - xfs_ioend_wake(ip); mempool_free(ioend, xfs_ioend_pool); } @@ -156,6 +117,15 @@ xfs_ioend_new_eof( } /* + * Fast and loose check if this write could update the on-disk inode size. + */ +static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) +{ + return ioend->io_offset + ioend->io_size > + XFS_I(ioend->io_inode)->i_d.di_size; +} + +/* * Update on-disk file size now that data has been written to disk. The * current in-memory file size is i_size. If a write is beyond eof i_new_size * will be the intended file size until i_size is updated. If this write does @@ -173,9 +143,6 @@ xfs_setfilesize( xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - if (unlikely(ioend->io_error)) - return 0; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) return EAGAIN; @@ -192,6 +159,9 @@ xfs_setfilesize( /* * Schedule IO completion handling on the final put of an ioend. + * + * If there is no work to do we might as well call it a day and free the + * ioend right now. */ STATIC void xfs_finish_ioend( @@ -200,8 +170,10 @@ xfs_finish_ioend( if (atomic_dec_and_test(&ioend->io_remaining)) { if (ioend->io_type == IO_UNWRITTEN) queue_work(xfsconvertd_workqueue, &ioend->io_work); - else + else if (xfs_ioend_is_append(ioend)) queue_work(xfsdatad_workqueue, &ioend->io_work); + else + xfs_destroy_ioend(ioend); } } @@ -216,17 +188,24 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + ioend->io_error = -EIO; + goto done; + } + if (ioend->io_error) + goto done; + /* * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. */ - if (ioend->io_type == IO_UNWRITTEN && - likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { - + if (ioend->io_type == IO_UNWRITTEN) { error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); - if (error) - ioend->io_error = error; + if (error) { + ioend->io_error = -error; + goto done; + } } /* @@ -236,6 +215,7 @@ xfs_end_io( error = xfs_setfilesize(ioend); ASSERT(!error || error == EAGAIN); +done: /* * If we didn't complete processing of the ioend, requeue it to the * tail of the workqueue for another attempt later. Otherwise destroy @@ -247,8 +227,6 @@ xfs_end_io( /* ensure we don't spin on blocked ioends */ delay(1); } else { - if (ioend->io_iocb) - aio_complete(ioend->io_iocb, ioend->io_result, 0); xfs_destroy_ioend(ioend); } } @@ -285,13 +263,13 @@ xfs_alloc_ioend( * all the I/O from calling the completion routine too early. */ atomic_set(&ioend->io_remaining, 1); + ioend->io_isasync = 0; ioend->io_error = 0; ioend->io_list = NULL; ioend->io_type = type; ioend->io_inode = inode; ioend->io_buffer_head = NULL; ioend->io_buffer_tail = NULL; - atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); ioend->io_offset = 0; ioend->io_size = 0; ioend->io_iocb = NULL; @@ -337,8 +315,8 @@ xfs_map_blocks( count = mp->m_maxioffset - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, - bmapi_flags, NULL, 0, imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + imap, &nimaps, bmapi_flags); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) @@ -551,7 +529,6 @@ xfs_cancel_ioend( unlock_buffer(bh); } while ((bh = next_bh) != NULL); - xfs_ioend_wake(XFS_I(ioend->io_inode)); mempool_free(ioend, xfs_ioend_pool); } while ((ioend = next) != NULL); } @@ -925,11 +902,11 @@ xfs_vm_writepage( * random callers for direct reclaim or memcg reclaim. We explicitly * allow reclaim from kswapd as the stack usage there is relatively low. * - * This should really be done by the core VM, but until that happens - * filesystems like XFS, btrfs and ext4 have to take care of this - * by themselves. + * This should never happen except in the case of a VM regression so + * warn about it. */ - if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) + if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == + PF_MEMALLOC)) goto redirty; /* @@ -1161,8 +1138,8 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); if (error) goto out_unlock; @@ -1310,28 +1287,17 @@ xfs_end_io_direct_write( ioend->io_offset = offset; ioend->io_size = size; + ioend->io_iocb = iocb; + ioend->io_result = ret; if (private && size > 0) ioend->io_type = IO_UNWRITTEN; if (is_async) { - /* - * If we are converting an unwritten extent we need to delay - * the AIO completion until after the unwrittent extent - * conversion has completed, otherwise do it ASAP. - */ - if (ioend->io_type == IO_UNWRITTEN) { - ioend->io_iocb = iocb; - ioend->io_result = ret; - } else { - aio_complete(iocb, ret, 0); - } + ioend->io_isasync = 1; xfs_finish_ioend(ioend); } else { xfs_finish_ioend_sync(ioend); } - - /* XXX: probably should move into the real I/O completion handler */ - inode_dio_done(ioend->io_inode); } STATIC ssize_t diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/xfs_aops.h index 71f721e1a71f..116dd5c37034 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -47,6 +47,7 @@ typedef struct xfs_ioend { unsigned int io_type; /* delalloc / unwritten */ int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ + unsigned int io_isasync : 1; /* needs aio_complete */ struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ struct buffer_head *io_buffer_tail;/* buffer linked list tail */ @@ -60,9 +61,6 @@ typedef struct xfs_ioend { extern const struct address_space_operations xfs_address_space_operations; extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); -extern void xfs_ioend_init(void); -extern void xfs_ioend_wait(struct xfs_inode *); - extern void xfs_count_page_state(struct page *, int *, int *); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index cbae424fe1ba..1e5d97f86ea8 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -319,7 +319,7 @@ xfs_attr_set_int( return (error); } - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * If the attribute list is non-existent or a shortform list, @@ -389,7 +389,7 @@ xfs_attr_set_int( * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * Commit the leaf transformation. We'll need another (linked) @@ -537,7 +537,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) * No need to make quota reservations here. We expect to release some * blocks not allocate in the common case. */ - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * Decide on what work routines to call based on the inode size. @@ -809,7 +809,7 @@ xfs_attr_inactive(xfs_inode_t *dp) * No need to make quota reservations here. We expect to release some * blocks, not allocate, in the common case. */ - xfs_trans_ijoin(trans, dp); + xfs_trans_ijoin(trans, dp, 0); /* * Decide on what work routines to call based on the inode size. @@ -823,18 +823,6 @@ xfs_attr_inactive(xfs_inode_t *dp) if (error) goto out; - /* - * Signal synchronous inactive transactions unless this is a - * synchronous mount filesystem in which case we know that we're here - * because we've been called out of xfs_inactive which means that the - * last reference is gone and the unlink transaction has already hit - * the disk so async inactive transactions are safe. - */ - if (!(mp->m_flags & XFS_MOUNT_WSYNC)) { - if (dp->i_d.di_anextents > 0) - xfs_trans_set_sync(trans); - } - error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); if (error) goto out; @@ -973,7 +961,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the current trans (including the inode) and start @@ -1075,7 +1063,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_buf_done(bp); @@ -1149,7 +1137,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_buf_done(bp); return(0); @@ -1303,7 +1291,7 @@ restart: * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the node conversion and start the next @@ -1340,7 +1328,7 @@ restart: * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else { /* * Addition succeeded, update Btree hashvals. @@ -1452,7 +1440,7 @@ restart: * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } /* @@ -1584,7 +1572,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the Btree join operation and start a new trans. @@ -1635,7 +1623,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_brelse(args->trans, bp); } @@ -1975,10 +1963,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) lblkno = args->rmtblkno; while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; - error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, map, &nmap, NULL); + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, map, &nmap, + XFS_BMAPI_ATTRFORK); if (error) return(error); ASSERT(nmap >= 1); @@ -2052,10 +2039,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | - XFS_BMAPI_WRITE, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, args->firstblock, args->total, &map, &nmap, args->flist); if (!error) { @@ -2074,7 +2060,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -2104,14 +2090,11 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, 0, &map, &nmap, - NULL); - if (error) { + error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) return(error); - } ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -2121,17 +2104,17 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, XBF_LOCK | XBF_DONT_BLOCK); - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); - + if (!bp) + return ENOMEM; tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : XFS_BUF_SIZE(bp); xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); if (tmp < XFS_BUF_SIZE(bp)) xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); - if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ - return (error); - } + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ + xfs_buf_relse(bp); + if (error) + return error; src += tmp; valuelen -= tmp; @@ -2167,16 +2150,12 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) /* * Try to remember where we decided to put the value. */ - xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, 0, &map, &nmap, - args->flist); - if (error) { + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) return(error); - } ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -2189,8 +2168,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) */ bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); if (bp) { - XFS_BUF_STALE(bp); - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_stale(bp); xfs_buf_relse(bp); bp = NULL; } @@ -2228,7 +2206,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, args->dp); + xfs_trans_ijoin(args->trans, args->dp, 0); /* * Close out trans and start the next one in the chain. diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 8fad9602542b..d4906e7c9787 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -2926,9 +2926,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, * Try to remember where we decided to put the value. */ nmap = 1; - error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, &map, &nmap, NULL); + error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) { return(error); } @@ -2948,6 +2947,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, bp = xfs_trans_get_buf(*trans, dp->i_mount->m_ddev_targp, dblkno, dblkcnt, XBF_LOCK); + if (!bp) + return ENOMEM; xfs_trans_binval(*trans, bp); /* * Roll to next transaction. diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c51a3f903633..c68baeb0974a 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -50,17 +50,22 @@ #include "xfs_trace.h" -#ifdef DEBUG -STATIC void -xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork); -#endif - kmem_zone_t *xfs_bmap_free_item_zone; /* * Prototypes for internal bmap routines. */ +#ifdef DEBUG +STATIC void +xfs_bmap_check_leaf_extents( + struct xfs_btree_cur *cur, + struct xfs_inode *ip, + int whichfork); +#else +#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) +#endif + /* * Called from xfs_bmap_add_attrfork to handle extents format files. @@ -85,58 +90,6 @@ xfs_bmap_add_attrfork_local( int *flags); /* inode logging flags */ /* - * Called by xfs_bmap_add_extent to handle cases converting a delayed - * allocation to a real allocation. - */ -STATIC int /* error */ -xfs_bmap_add_extent_delay_real( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp); /* inode logging flags */ - -/* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a delayed allocation. - */ -STATIC int /* error */ -xfs_bmap_add_extent_hole_delay( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp); /* inode logging flags */ - -/* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a real allocation. - */ -STATIC int /* error */ -xfs_bmap_add_extent_hole_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Called by xfs_bmap_add_extent to handle cases converting an unwritten - * allocation to a real allocation or vice versa. - */ -STATIC int /* error */ -xfs_bmap_add_extent_unwritten_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp); /* inode logging flags */ - -/* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. */ @@ -215,19 +168,6 @@ xfs_bmap_search_extents( xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ /* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - */ -STATIC int /* error */ -xfs_bmap_isaeof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t off, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - char *aeof); /* return value */ - -/* * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". */ @@ -414,7 +354,7 @@ xfs_bmap_add_attrfork_local( if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) return 0; - if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(ip->i_d.di_mode)) { mp = ip->i_mount; memset(&dargs, 0, sizeof(dargs)); dargs.dp = ip; @@ -431,188 +371,13 @@ xfs_bmap_add_attrfork_local( } /* - * Called by xfs_bmapi to update file extent records and the btree - * after allocating space (or doing a delayed allocation). - */ -STATIC int /* error */ -xfs_bmap_add_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - xfs_btree_cur_t *cur; /* btree cursor or null */ - xfs_filblks_t da_new; /* new count del alloc blocks used */ - xfs_filblks_t da_old; /* old count del alloc blocks used */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork ptr */ - int logflags; /* returned value */ - xfs_extnum_t nextents; /* number of extents in file now */ - - XFS_STATS_INC(xs_add_exlist); - - cur = *curp; - ifp = XFS_IFORK_PTR(ip, whichfork); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - da_old = da_new = 0; - error = 0; - - ASSERT(*idx >= 0); - ASSERT(*idx <= nextents); - - /* - * This is the first extent added to a new/empty file. - * Special case this one, so other routines get to assume there are - * already extents in the list. - */ - if (nextents == 0) { - xfs_iext_insert(ip, *idx, 1, new, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); - - ASSERT(cur == NULL); - - if (!isnullstartblock(new->br_startblock)) { - XFS_IFORK_NEXT_SET(ip, whichfork, 1); - logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); - } else - logflags = 0; - } - /* - * Any kind of new delayed allocation goes here. - */ - else if (isnullstartblock(new->br_startblock)) { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_delay(ip, idx, new, - &logflags); - } - /* - * Real allocation off the end of the file. - */ - else if (*idx == nextents) { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, - &logflags, whichfork); - } else { - xfs_bmbt_irec_t prev; /* old extent at offset idx */ - - /* - * Get the record referred to by idx. - */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev); - /* - * If it's a real allocation record, and the new allocation ends - * after the start of the referred to record, then we're filling - * in a delayed or unwritten allocation with a real one, or - * converting real back to unwritten. - */ - if (!isnullstartblock(new->br_startblock) && - new->br_startoff + new->br_blockcount > prev.br_startoff) { - if (prev.br_state != XFS_EXT_UNWRITTEN && - isnullstartblock(prev.br_startblock)) { - da_old = startblockval(prev.br_startblock); - if (cur) - ASSERT(cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL); - error = xfs_bmap_add_extent_delay_real(tp, ip, - idx, &cur, new, &da_new, - first, flist, &logflags); - } else { - ASSERT(new->br_state == XFS_EXT_NORM || - new->br_state == XFS_EXT_UNWRITTEN); - - error = xfs_bmap_add_extent_unwritten_real(ip, - idx, &cur, new, &logflags); - if (error) - goto done; - } - } - /* - * Otherwise we're filling in a hole with an allocation. - */ - else { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, - new, &logflags, whichfork); - } - } - - if (error) - goto done; - ASSERT(*curp == cur || *curp == NULL); - - /* - * Convert to a btree if necessary. - */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { - int tmp_logflags; /* partial log flag return val */ - - ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, first, - flist, &cur, da_old > 0, &tmp_logflags, whichfork); - logflags |= tmp_logflags; - if (error) - goto done; - } - /* - * Adjust for changes in reserved delayed indirect blocks. - * Nothing to do for disk quotas here. - */ - if (da_old || da_new) { - xfs_filblks_t nblks; - - nblks = da_new; - if (cur) - nblks += cur->bc_private.b.allocated; - ASSERT(nblks <= da_old); - if (nblks < da_old) - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - nblks), 0); - } - /* - * Clear out the allocated field, done with it now in any case. - */ - if (cur) { - cur->bc_private.b.allocated = 0; - *curp = cur; - } -done: -#ifdef DEBUG - if (!error) - xfs_bmap_check_leaf_extents(*curp, ip, whichfork); -#endif - *logflagsp = logflags; - return error; -} - -/* - * Called by xfs_bmap_add_extent to handle cases converting a delayed - * allocation to a real allocation. + * Convert a delayed allocation to a real allocation. */ STATIC int /* error */ xfs_bmap_add_extent_delay_real( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp) /* inode logging flags */ + struct xfs_bmalloca *bma) { - xfs_btree_cur_t *cur; /* btree cursor */ + struct xfs_bmbt_irec *new = &bma->got; int diff; /* temp value */ xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ int error; /* error return value */ @@ -623,10 +388,22 @@ xfs_bmap_add_extent_delay_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - xfs_filblks_t temp=0; /* value for dnew calculations */ - xfs_filblks_t temp2=0;/* value for dnew calculations */ + xfs_filblks_t da_new; /* new count del alloc blocks used */ + xfs_filblks_t da_old; /* old count del alloc blocks used */ + xfs_filblks_t temp=0; /* value for da_new calculations */ + xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ + ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); + + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!bma->cur || + (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] @@ -634,14 +411,15 @@ xfs_bmap_add_extent_delay_real( /* * Set up a bunch of variables to make the tests simpler. */ - cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, bma->idx); xfs_bmbt_get_all(ep, &PREV); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + da_old = startblockval(PREV.br_startblock); + da_new = 0; + /* * Set flags determining what part of the previous delayed allocation * extent is being replaced by a real allocation. @@ -655,9 +433,9 @@ xfs_bmap_add_extent_delay_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (*idx > 0) { + if (bma->idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT); if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -675,9 +453,9 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; @@ -708,38 +486,41 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + bma->idx--; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 2, state); - ip->i_d.di_nextents--; - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); + bma->ip->i_d.di_nextents--; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_delete(cur, &i))) + error = xfs_btree_delete(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_decrement(cur, 0, &i))) + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount + - RIGHT.br_blockcount, LEFT.br_state))) + RIGHT.br_blockcount, LEFT.br_state); + if (error) goto done; } - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: @@ -747,30 +528,31 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. */ - --*idx; + bma->idx--; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + - PREV.br_blockcount, LEFT.br_state))) + PREV.br_blockcount, LEFT.br_state); + if (error) goto done; } - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -778,30 +560,30 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + error = xfs_bmbt_update(bma->cur, PREV.br_startoff, new->br_startblock, PREV.br_blockcount + - RIGHT.br_blockcount, PREV.br_state))) + RIGHT.br_blockcount, PREV.br_state); + if (error) goto done; } - - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: @@ -810,27 +592,27 @@ xfs_bmap_add_extent_delay_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ip->i_d.di_nextents++; - if (cur == NULL) + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: @@ -838,39 +620,40 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_); temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - if (cur == NULL) + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state))) + LEFT.br_state); + if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - --*idx; - *dnew = temp; + bma->idx--; break; case BMAP_LEFT_FILLING: @@ -878,43 +661,43 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, new_endoff); temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, + &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx + 1); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); - - *dnew = temp; + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx + 1); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -923,38 +706,39 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is contiguous with the new allocation. */ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1), + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, RIGHT.br_state); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); - if (cur == NULL) + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, + error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, - RIGHT.br_state))) + RIGHT.br_state); + if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ++*idx; - *dnew = temp; + bma->idx++; break; case BMAP_RIGHT_FILLING: @@ -963,42 +747,43 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. */ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx + 1, 1, new, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, 1, + &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ++*idx; - *dnew = temp; + bma->idx++; break; case 0: @@ -1024,82 +809,65 @@ xfs_bmap_add_extent_delay_real( */ temp = new->br_startoff - PREV.br_startoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ LEFT = *new; RIGHT.br_state = PREV.br_state; RIGHT.br_startblock = nullstartblock( - (int)xfs_bmap_worst_indlen(ip, temp2)); + (int)xfs_bmap_worst_indlen(bma->ip, temp2)); RIGHT.br_startoff = new_endoff; RIGHT.br_blockcount = temp2; /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ - xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = xfs_bmap_worst_indlen(ip, temp); - temp2 = xfs_bmap_worst_indlen(ip, temp2); + temp = xfs_bmap_worst_indlen(bma->ip, temp); + temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - if (diff > 0 && - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) { - /* - * Ick gross gag me with a spoon. - */ - ASSERT(0); /* want to see if this ever happens! */ - while (diff > 0) { - if (temp) { - temp--; - diff--; - if (!diff || - !xfs_icsb_modify_counters(ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) - break; - } - if (temp2) { - temp2--; - diff--; - if (!diff || - !xfs_icsb_modify_counters(ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) - break; - } - } + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + if (diff > 0) { + error = xfs_icsb_modify_counters(bma->ip->i_mount, + XFS_SBS_FDBLOCKS, + -((int64_t)diff), 0); + ASSERT(!error); + if (error) + goto done; } - ep = xfs_iext_get_ext(ifp, *idx); + + ep = xfs_iext_get_ext(ifp, bma->idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2), + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2), nullstartblock((int)temp2)); - trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_); - ++*idx; - *dnew = temp + temp2; + bma->idx++; + da_new = temp + temp2; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: @@ -1114,9 +882,40 @@ xfs_bmap_add_extent_delay_real( */ ASSERT(0); } - *curp = cur; + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + da_old > 0, &tmp_logflags, XFS_DATA_FORK); + bma->logflags |= tmp_logflags; + if (error) + goto done; + } + + /* adjust for changes in reserved delayed indirect blocks */ + if (da_old || da_new) { + temp = da_new; + if (bma->cur) + temp += bma->cur->bc_private.b.allocated; + ASSERT(temp <= da_old); + if (temp < da_old) + xfs_icsb_modify_counters(bma->ip->i_mount, + XFS_SBS_FDBLOCKS, + (int64_t)(da_old - temp), 0); + } + + /* clear out the allocated field, done with it now in any case. */ + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK); done: - *logflagsp = rval; + bma->logflags |= rval; return error; #undef LEFT #undef RIGHT @@ -1124,15 +923,17 @@ done: } /* - * Called by xfs_bmap_add_extent to handle cases converting an unwritten - * allocation to a real allocation or vice versa. + * Convert an unwritten allocation to a real allocation or vice versa. */ STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( + struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ @@ -1148,15 +949,25 @@ xfs_bmap_add_extent_unwritten_real( int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ + *logflagsp = 0; + + cur = *curp; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + + ASSERT(*idx >= 0); + ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + + XFS_STATS_INC(xs_add_exlist); + #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] + /* * Set up a bunch of variables to make the tests simpler. */ error = 0; - cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &PREV); newext = new->br_state; @@ -1406,10 +1217,11 @@ xfs_bmap_add_extent_unwritten_real( goto done; if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - if (xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state)) + LEFT.br_state); + if (error) goto done; } break; @@ -1607,9 +1419,29 @@ xfs_bmap_add_extent_unwritten_real( */ ASSERT(0); } - *curp = cur; + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + 0, &tmp_logflags, XFS_DATA_FORK); + *logflagsp |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } + + xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); done: - *logflagsp = rval; + *logflagsp |= rval; return error; #undef LEFT #undef RIGHT @@ -1617,16 +1449,13 @@ done: } /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a delayed allocation. + * Convert a hole to a delayed allocation. */ -/*ARGSUSED*/ -STATIC int /* error */ +STATIC void xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp) /* inode logging flags */ + xfs_bmbt_irec_t *new) /* new data to add to file extents */ { xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ @@ -1761,23 +1590,17 @@ xfs_bmap_add_extent_hole_delay( * Nothing to do for disk quota accounting here. */ } - *logflagsp = 0; - return 0; } /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a real allocation. + * Convert a hole to a real allocation. */ STATIC int /* error */ xfs_bmap_add_extent_hole_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + struct xfs_bmalloca *bma, + int whichfork) { + struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -1786,19 +1609,26 @@ xfs_bmap_add_extent_hole_real( int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); - state = 0; + ifp = XFS_IFORK_PTR(bma->ip, whichfork); + + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!bma->cur || + !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + state = 0; if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; /* * Check and set flags if this segment has a left neighbor. */ - if (*idx > 0) { + if (bma->idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -1807,9 +1637,9 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -1846,39 +1676,42 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), left.br_blockcount + new->br_blockcount + right.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); - if (cur == NULL) { + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1); + if (bma->cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, - right.br_startoff, - right.br_startblock, - right.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, + right.br_startblock, right.br_blockcount, + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_delete(cur, &i))) + error = xfs_btree_delete(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_decrement(cur, 0, &i))) + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, left.br_startoff, + error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount + right.br_blockcount, - left.br_state))) + left.br_state); + if (error) goto done; } break; @@ -1889,27 +1722,28 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), left.br_blockcount + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - if (cur == NULL) { + if (bma->cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, - left.br_startoff, - left.br_startblock, - left.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff, + left.br_startblock, left.br_blockcount, + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, left.br_startoff, + error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount, - left.br_state))) + left.br_state); + if (error) goto done; } break; @@ -1920,28 +1754,30 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx), new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - if (cur == NULL) { + if (bma->cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, right.br_startblock, - right.br_blockcount, &i))) + right.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, + error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, - right.br_state))) + right.br_state); + if (error) goto done; } break; @@ -1952,28 +1788,50 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_iext_insert(ip, *idx, 1, new, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); - if (cur == NULL) { + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1); + if (bma->cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, - new->br_blockcount, &i))) + new->br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = new->br_state; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = new->br_state; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } break; } + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 0, &tmp_logflags, whichfork); + bma->logflags |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: - *logflagsp = rval; + bma->logflags |= rval; return error; } @@ -2160,26 +2018,26 @@ xfs_bmap_adjacent( XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) mp = ap->ip->i_mount; - nullfb = ap->firstblock == NULLFSBLOCK; + nullfb = *ap->firstblock == NULLFSBLOCK; rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); /* * If allocating at eof, and there's a previous real block, * try to use its last block as our starting point. */ - if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && - !isnullstartblock(ap->prevp->br_startblock) && - ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, - ap->prevp->br_startblock)) { - ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; + if (ap->eof && ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, + ap->prev.br_startblock)) { + ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount; /* * Adjust for the gap between prevp and us. */ - adjust = ap->off - - (ap->prevp->br_startoff + ap->prevp->br_blockcount); + adjust = ap->offset - + (ap->prev.br_startoff + ap->prev.br_blockcount); if (adjust && - ISVALID(ap->rval + adjust, ap->prevp->br_startblock)) - ap->rval += adjust; + ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) + ap->blkno += adjust; } /* * If not at eof, then compare the two neighbor blocks. @@ -2196,17 +2054,17 @@ xfs_bmap_adjacent( * If there's a previous (left) block, select a requested * start block based on it. */ - if (ap->prevp->br_startoff != NULLFILEOFF && - !isnullstartblock(ap->prevp->br_startblock) && - (prevbno = ap->prevp->br_startblock + - ap->prevp->br_blockcount) && - ISVALID(prevbno, ap->prevp->br_startblock)) { + if (ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + (prevbno = ap->prev.br_startblock + + ap->prev.br_blockcount) && + ISVALID(prevbno, ap->prev.br_startblock)) { /* * Calculate gap to end of previous block. */ - adjust = prevdiff = ap->off - - (ap->prevp->br_startoff + - ap->prevp->br_blockcount); + adjust = prevdiff = ap->offset - + (ap->prev.br_startoff + + ap->prev.br_blockcount); /* * Figure the startblock based on the previous block's * end and the gap size. @@ -2215,9 +2073,9 @@ xfs_bmap_adjacent( * allocating, or using it gives us an invalid block * number, then just use the end of the previous block. */ - if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length && ISVALID(prevbno + prevdiff, - ap->prevp->br_startblock)) + ap->prev.br_startblock)) prevbno += adjust; else prevdiff += adjust; @@ -2238,16 +2096,16 @@ xfs_bmap_adjacent( * If there's a following (right) block, select a requested * start block based on it. */ - if (!isnullstartblock(ap->gotp->br_startblock)) { + if (!isnullstartblock(ap->got.br_startblock)) { /* * Calculate gap to start of next block. */ - adjust = gotdiff = ap->gotp->br_startoff - ap->off; + adjust = gotdiff = ap->got.br_startoff - ap->offset; /* * Figure the startblock based on the next block's * start and the gap size. */ - gotbno = ap->gotp->br_startblock; + gotbno = ap->got.br_startblock; /* * Heuristic! * If the gap is large relative to the piece we're @@ -2255,12 +2113,12 @@ xfs_bmap_adjacent( * number, then just use the start of the next block * offset by our length. */ - if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length && ISVALID(gotbno - gotdiff, gotbno)) gotbno -= adjust; - else if (ISVALID(gotbno - ap->alen, gotbno)) { - gotbno -= ap->alen; - gotdiff += adjust - ap->alen; + else if (ISVALID(gotbno - ap->length, gotbno)) { + gotbno -= ap->length; + gotdiff += adjust - ap->length; } else gotdiff += adjust; /* @@ -2278,14 +2136,14 @@ xfs_bmap_adjacent( gotbno = NULLFSBLOCK; /* * If both valid, pick the better one, else the only good - * one, else ap->rval is already set (to 0 or the inode block). + * one, else ap->blkno is already set (to 0 or the inode block). */ if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) - ap->rval = prevdiff <= gotdiff ? prevbno : gotbno; + ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno; else if (prevbno != NULLFSBLOCK) - ap->rval = prevbno; + ap->blkno = prevbno; else if (gotbno != NULLFSBLOCK) - ap->rval = gotbno; + ap->blkno = gotbno; } #undef ISVALID } @@ -2305,24 +2163,24 @@ xfs_bmap_rtalloc( mp = ap->ip->i_mount; align = xfs_get_extsz_hint(ap->ip); prod = align / mp->m_sb.sb_rextsize; - error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, - ap->conv, &ap->off, &ap->alen); + ap->conv, &ap->offset, &ap->length); if (error) return error; - ASSERT(ap->alen); - ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); + ASSERT(ap->length); + ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); /* * If the offset & length are not perfectly aligned * then kill prod, it will just get us in trouble. */ - if (do_mod(ap->off, align) || ap->alen % align) + if (do_mod(ap->offset, align) || ap->length % align) prod = 1; /* * Set ralen to be the actual requested length in rtextents. */ - ralen = ap->alen / mp->m_sb.sb_rextsize; + ralen = ap->length / mp->m_sb.sb_rextsize; /* * If the old value was close enough to MAXEXTLEN that * we rounded up to it, cut it back so it's valid again. @@ -2337,21 +2195,21 @@ xfs_bmap_rtalloc( * Lock out other modifications to the RT bitmap inode. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* * If it's an allocation to an empty file at offset 0, * pick an extent that will space things out in the rt area. */ - if (ap->eof && ap->off == 0) { + if (ap->eof && ap->offset == 0) { xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); if (error) return error; - ap->rval = rtx * mp->m_sb.sb_rextsize; + ap->blkno = rtx * mp->m_sb.sb_rextsize; } else { - ap->rval = 0; + ap->blkno = 0; } xfs_bmap_adjacent(ap); @@ -2359,23 +2217,23 @@ xfs_bmap_rtalloc( /* * Realtime allocation, done through xfs_rtallocate_extent. */ - atype = ap->rval == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; - do_div(ap->rval, mp->m_sb.sb_rextsize); - rtb = ap->rval; - ap->alen = ralen; - if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen, + atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; + do_div(ap->blkno, mp->m_sb.sb_rextsize); + rtb = ap->blkno; + ap->length = ralen; + if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, &ralen, atype, ap->wasdel, prod, &rtb))) return error; if (rtb == NULLFSBLOCK && prod > 1 && - (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, - ap->alen, &ralen, atype, + (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, + ap->length, &ralen, atype, ap->wasdel, 1, &rtb))) return error; - ap->rval = rtb; - if (ap->rval != NULLFSBLOCK) { - ap->rval *= mp->m_sb.sb_rextsize; + ap->blkno = rtb; + if (ap->blkno != NULLFSBLOCK) { + ap->blkno *= mp->m_sb.sb_rextsize; ralen *= mp->m_sb.sb_rextsize; - ap->alen = ralen; + ap->length = ralen; ap->ip->i_d.di_nblocks += ralen; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) @@ -2388,7 +2246,7 @@ xfs_bmap_rtalloc( ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_RTBCOUNT, (long) ralen); } else { - ap->alen = 0; + ap->length = 0; } return 0; } @@ -2503,7 +2361,7 @@ xfs_bmap_btalloc_nullfb( * AG as the stream may have moved. */ if (xfs_inode_is_filestream(ap->ip)) - ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); return 0; } @@ -2528,52 +2386,52 @@ xfs_bmap_btalloc( mp = ap->ip->i_mount; align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; if (unlikely(align)) { - error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, - &ap->off, &ap->alen); + &ap->offset, &ap->length); ASSERT(!error); - ASSERT(ap->alen); + ASSERT(ap->length); } - nullfb = ap->firstblock == NULLFSBLOCK; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + nullfb = *ap->firstblock == NULLFSBLOCK; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); if (nullfb) { if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { ag = xfs_filestream_lookup_ag(ap->ip); ag = (ag != NULLAGNUMBER) ? ag : 0; - ap->rval = XFS_AGB_TO_FSB(mp, ag, 0); + ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); } else { - ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino); } } else - ap->rval = ap->firstblock; + ap->blkno = *ap->firstblock; xfs_bmap_adjacent(ap); /* - * If allowed, use ap->rval; otherwise must use firstblock since + * If allowed, use ap->blkno; otherwise must use firstblock since * it's in the right allocation group. */ - if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) + if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno) ; else - ap->rval = ap->firstblock; + ap->blkno = *ap->firstblock; /* * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; args.tp = ap->tp; args.mp = mp; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; /* Trim the allocation back to the maximum an AG can fit. */ - args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); - args.firstblock = ap->firstblock; + args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); + args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); if (error) return error; - } else if (ap->low) { + } else if (ap->flist->xbf_low) { if (xfs_inode_is_filestream(ap->ip)) args.type = XFS_ALLOCTYPE_FIRST_AG; else @@ -2587,14 +2445,14 @@ xfs_bmap_btalloc( /* apply extent size hints if obtained earlier */ if (unlikely(align)) { args.prod = align; - if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) + if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { args.prod = 1; args.mod = 0; } else { args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; - if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) + if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod)))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } /* @@ -2606,8 +2464,8 @@ xfs_bmap_btalloc( * is >= the stripe unit and the allocation offset is * at the end of file. */ - if (!ap->low && ap->aeof) { - if (!ap->off) { + if (!ap->flist->xbf_low && ap->aeof) { + if (!ap->offset) { args.alignment = mp->m_dalign; atype = args.type; isaligned = 1; @@ -2660,7 +2518,7 @@ xfs_bmap_btalloc( * turned on. */ args.type = atype; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; args.alignment = mp->m_dalign; args.minlen = nextminlen; args.minalignslop = 0; @@ -2674,7 +2532,7 @@ xfs_bmap_btalloc( * try again. */ args.type = atype; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; args.alignment = 0; if ((error = xfs_alloc_vextent(&args))) return error; @@ -2683,7 +2541,7 @@ xfs_bmap_btalloc( args.minlen > ap->minlen) { args.minlen = ap->minlen; args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -2694,13 +2552,26 @@ xfs_bmap_btalloc( args.minleft = 0; if ((error = xfs_alloc_vextent(&args))) return error; - ap->low = 1; + ap->flist->xbf_low = 1; } if (args.fsbno != NULLFSBLOCK) { - ap->firstblock = ap->rval = args.fsbno; + /* + * check the allocation happened at the same or higher AG than + * the first block that was allocated. + */ + ASSERT(*ap->firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *ap->firstblock) == + XFS_FSB_TO_AGNO(mp, args.fsbno) || + (ap->flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *ap->firstblock) < + XFS_FSB_TO_AGNO(mp, args.fsbno))); + + ap->blkno = args.fsbno; + if (*ap->firstblock == NULLFSBLOCK) + *ap->firstblock = args.fsbno; ASSERT(nullfb || fb_agno == args.agno || - (ap->low && fb_agno < args.agno)); - ap->alen = args.len; + (ap->flist->xbf_low && fb_agno < args.agno)); + ap->length = args.len; ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) @@ -2714,8 +2585,8 @@ xfs_bmap_btalloc( XFS_TRANS_DQ_BCOUNT, (long) args.len); } else { - ap->rval = NULLFSBLOCK; - ap->alen = 0; + ap->blkno = NULLFSBLOCK; + ap->length = 0; } return 0; } @@ -3344,8 +3215,7 @@ xfs_bmap_local_to_extents( * We don't want to deal with the case of keeping inode data inline yet. * So sending the data fork of a regular inode is invalid. */ - ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG && - whichfork == XFS_DATA_FORK)); + ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); flags = 0; @@ -3384,8 +3254,7 @@ xfs_bmap_local_to_extents( ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data, - ifp->if_bytes); + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); @@ -3591,7 +3460,7 @@ xfs_bmap_add_attrfork( } ASSERT(ip->i_d.di_anextents == 0); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); switch (ip->i_d.di_format) { @@ -3784,19 +3653,11 @@ xfs_bmap_compute_maxlevels( * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi * caller. Frees all the extents that need freeing, which must be done * last due to locking considerations. We never free any extents in - * the first transaction. This is to allow the caller to make the first - * transaction a synchronous one so that the pointers to the data being - * broken in this transaction will be permanent before the data is actually - * freed. This is necessary to prevent blocks from being reallocated - * and written to before the free and reallocation are actually permanent. - * We do not just make the first transaction synchronous here, because - * there are more efficient ways to gain the same protection in some cases - * (see the file truncation code). + * the first transaction. * * Return 1 if the given transaction was committed and a new one * started, and 0 otherwise in the committed parameter. */ -/*ARGSUSED*/ int /* error */ xfs_bmap_finish( xfs_trans_t **tp, /* transaction pointer addr */ @@ -3996,42 +3857,122 @@ xfs_bmap_last_before( return 0; } +STATIC int +xfs_bmap_last_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *rec, + int *is_empty) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int error; + int nextents; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *is_empty = 1; + return 0; + } + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); + *is_empty = 0; + return 0; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + * + * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be + * at, or past the EOF. + */ +STATIC int +xfs_bmap_isaeof( + struct xfs_bmalloca *bma, + int whichfork) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + bma->aeof = 0; + error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, + &is_empty); + if (error || is_empty) + return error; + + /* + * Check if we are allocation or past the last extent, or at least into + * the last delayed allocated extent. + */ + bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || + (bma->offset >= rec.br_startoff && + isnullstartblock(rec.br_startblock)); + return 0; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. All offsets are considered outside + * the end of file for an empty fork, so 1 is returned in *eof in that case. + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof) +{ + struct xfs_bmbt_irec rec; + int error; + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); + if (error || *eof) + return error; + + *eof = endoff >= rec.br_startoff + rec.br_blockcount; + return 0; +} + /* * Returns the file-relative block number of the first block past eof in * the file. This is not based on i_size, it is based on the extent records. * Returns 0 for local files, as they do not have extent records. */ -int /* error */ +int xfs_bmap_last_offset( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t *last_block, + int whichfork) { - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extent entries */ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + *last_block = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + return 0; if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) return XFS_ERROR(EIO); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *last_block = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + if (error || is_empty) return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (!nextents) { - *last_block = 0; - return 0; - } - ep = xfs_iext_get_ext(ifp, nextents - 1); - *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep); + + *last_block = rec.br_startoff + rec.br_blockcount; return 0; } @@ -4052,7 +3993,7 @@ xfs_bmap_one_block( #ifndef DEBUG if (whichfork == XFS_DATA_FORK) { - return ((ip->i_d.di_mode & S_IFMT) == S_IFREG) ? + return S_ISREG(ip->i_d.di_mode) ? (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); } @@ -4161,7 +4102,6 @@ xfs_bmap_read_extents( xfs_extnum_t num_recs; xfs_extnum_t start; - num_recs = xfs_btree_get_numrecs(block); if (unlikely(i + num_recs > room)) { ASSERT(i + num_recs <= room); @@ -4284,9 +4224,8 @@ xfs_bmap_validate_ret( ASSERT(i == 0 || mval[i - 1].br_startoff + mval[i - 1].br_blockcount == mval[i].br_startoff); - if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY)) - ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && - mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); ASSERT(mval[i].br_state == XFS_EXT_NORM || mval[i].br_state == XFS_EXT_UNWRITTEN); } @@ -4295,66 +4234,609 @@ xfs_bmap_validate_ret( /* - * Map file blocks to filesystem blocks. - * File range is given by the bno/len pair. - * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) - * into a hole or past eof. - * Only allocates blocks from a single allocation group, - * to avoid locking problems. + * Trim the returned map to the required bounds + */ +STATIC void +xfs_bmapi_trim_map( + struct xfs_bmbt_irec *mval, + struct xfs_bmbt_irec *got, + xfs_fileoff_t *bno, + xfs_filblks_t len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int n, + int flags) +{ + if ((flags & XFS_BMAPI_ENTIRE) || + got->br_startoff + got->br_blockcount <= obno) { + *mval = *got; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + return; + } + + if (obno > *bno) + *bno = obno; + ASSERT((*bno >= obno) || (n == 0)); + ASSERT(*bno < end); + mval->br_startoff = *bno; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + else + mval->br_startblock = got->br_startblock + + (*bno - got->br_startoff); + /* + * Return the minimum of what we got and what we asked for for + * the length. We can use the len variable here because it is + * modified below and we could have been there before coming + * here if the first part of the allocation didn't overlap what + * was asked for. + */ + mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno, + got->br_blockcount - (*bno - got->br_startoff)); + mval->br_state = got->br_state; + ASSERT(mval->br_blockcount <= len); + return; +} + +/* + * Update and validate the extent map to return + */ +STATIC void +xfs_bmapi_update_map( + struct xfs_bmbt_irec **map, + xfs_fileoff_t *bno, + xfs_filblks_t *len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int *n, + int flags) +{ + xfs_bmbt_irec_t *mval = *map; + + ASSERT((flags & XFS_BMAPI_ENTIRE) || + ((mval->br_startoff + mval->br_blockcount) <= end)); + ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) || + (mval->br_startoff < obno)); + + *bno = mval->br_startoff + mval->br_blockcount; + *len = end - *bno; + if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) { + /* update previous map with new information */ + ASSERT(mval->br_startblock == mval[-1].br_startblock); + ASSERT(mval->br_blockcount > mval[-1].br_blockcount); + ASSERT(mval->br_state == mval[-1].br_state); + mval[-1].br_blockcount = mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != HOLESTARTBLOCK && + mval->br_startblock == mval[-1].br_startblock + + mval[-1].br_blockcount && + ((flags & XFS_BMAPI_IGSTATE) || + mval[-1].br_state == mval->br_state)) { + ASSERT(mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount); + mval[-1].br_blockcount += mval->br_blockcount; + } else if (*n > 0 && + mval->br_startblock == DELAYSTARTBLOCK && + mval[-1].br_startblock == DELAYSTARTBLOCK && + mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount) { + mval[-1].br_blockcount += mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (!((*n == 0) && + ((mval->br_startoff + mval->br_blockcount) <= + obno))) { + mval++; + (*n)++; + } + *map = mval; +} + +/* + * Map file blocks to filesystem blocks without allocation. + */ +int +xfs_bmapi_read( + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + struct xfs_bmbt_irec *mval, + int *nmap, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec prev; + xfs_fileoff_t obno; + xfs_fileoff_t end; + xfs_extnum_t lastx; + int error; + int eof; + int n = 0; + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + + ASSERT(*nmap >= 1); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| + XFS_BMAPI_IGSTATE))); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_blk_mapr); + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, whichfork); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + /* Reading past eof, act as though there's a hole up to end. */ + if (eof) + got.br_startoff = end; + if (got.br_startoff > bno) { + /* Reading in a hole. */ + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = + XFS_FILBLKS_MIN(len, got.br_startoff - bno); + mval->br_state = XFS_EXT_NORM; + bno += mval->br_blockcount; + len -= mval->br_blockcount; + mval++; + n++; + continue; + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. */ + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + *nmap = n; + return 0; +} + +STATIC int +xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, + xfs_fileoff_t aoff, + xfs_filblks_t len, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *prev, + xfs_extnum_t *lastx, + int eof) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_extlen_t alen; + xfs_extlen_t indlen; + char rt = XFS_IS_REALTIME_INODE(ip); + xfs_extlen_t extsz; + int error; + + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + + /* Figure out the extent size, adjust alen */ + extsz = xfs_get_extsz_hint(ip); + if (extsz) { + /* + * Make sure we don't exceed a single extent length when we + * align the extent by reducing length we are going to + * allocate by the maximum amount extent size aligment may + * require. + */ + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1)); + error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, + 1, 0, &aoff, &alen); + ASSERT(!error); + } + + if (rt) + extsz = alen / mp->m_sb.sb_rextsize; + + /* + * Make a transaction-less quota reservation for delayed allocation + * blocks. This number gets adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, + rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + return error; + + /* + * Split changing sb for alen and indlen since they could be coming + * from different places. + */ + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + if (rt) { + error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, + -((int64_t)extsz), 0); + } else { + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + -((int64_t)alen), 0); + } + + if (error) + goto out_unreserve_quota; + + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + -((int64_t)indlen), 0); + if (error) + goto out_unreserve_blocks; + + + ip->i_delayed_blks += alen; + + got->br_startoff = aoff; + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; + xfs_bmap_add_extent_hole_delay(ip, lastx, got); + + /* + * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay + * might have merged it into one of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + + ASSERT(got->br_startoff <= aoff); + ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); + ASSERT(isnullstartblock(got->br_startblock)); + ASSERT(got->br_state == XFS_EXT_NORM); + return 0; + +out_unreserve_blocks: + if (rt) + xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); + else + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); +out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + return error; +} + +/* + * Map file blocks to filesystem blocks, adding delayed allocations as needed. + */ +int +xfs_bmapi_delay( + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + int flags) /* XFS_BMAPI_... */ +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_bmbt_irec got; /* current file extent record */ + struct xfs_bmbt_irec prev; /* previous file extent record */ + xfs_fileoff_t obno; /* old block number (offset) */ + xfs_fileoff_t end; /* end of mapped file region */ + xfs_extnum_t lastx; /* last useful extent number */ + int eof; /* we've hit the end of extents */ + int n = 0; /* current extent index */ + int error = 0; + + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_blk_mapw); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + if (eof || got.br_startoff > bno) { + error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, + &prev, &lastx, eof); + if (error) { + if (n == 0) { + *nmap = 0; + return error; + } + break; + } + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. */ + prev = got; + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + + *nmap = n; + return 0; +} + + +STATIC int +xfs_bmapi_allocate( + struct xfs_bmalloca *bma, + int flags) +{ + struct xfs_mount *mp = bma->ip->i_mount; + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; + int error; + int rt; + + rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); + + /* + * For the wasdelay case, we could also just allocate the stuff asked + * for in this bmap call but that wouldn't be as good. + */ + if (bma->wasdel) { + bma->length = (xfs_extlen_t)bma->got.br_blockcount; + bma->offset = bma->got.br_startoff; + if (bma->idx != NULLEXTNUM && bma->idx) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), + &bma->prev); + } + } else { + bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + if (!bma->eof) + bma->length = XFS_FILBLKS_MIN(bma->length, + bma->got.br_startoff - bma->offset); + } + + /* + * Indicate if this is the first user data in the file, or just any + * user data. + */ + if (!(flags & XFS_BMAPI_METADATA)) { + bma->userdata = (bma->offset == 0) ? + XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; + } + + bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + + /* + * Only want to do the alignment at the eof if it is userdata and + * allocation length is larger than a stripe unit. + */ + if (mp->m_dalign && bma->length >= mp->m_dalign && + !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + error = xfs_bmap_isaeof(bma, whichfork); + if (error) + return error; + } + + error = xfs_bmap_alloc(bma); + if (error) + return error; + + if (bma->flist->xbf_low) + bma->minleft = 0; + if (bma->cur) + bma->cur->bc_private.b.firstblock = *bma->firstblock; + if (bma->blkno == NULLFSBLOCK) + return 0; + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; + } + /* + * Bump the number of extents we've allocated + * in this call. + */ + bma->nallocs++; + + if (bma->cur) + bma->cur->bc_private.b.flags = + bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + + bma->got.br_startoff = bma->offset; + bma->got.br_startblock = bma->blkno; + bma->got.br_blockcount = bma->length; + bma->got.br_state = XFS_EXT_NORM; + + /* + * A wasdelay extent has been initialized, so shouldn't be flagged + * as unwritten. + */ + if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && + xfs_sb_version_hasextflgbit(&mp->m_sb)) + bma->got.br_state = XFS_EXT_UNWRITTEN; + + if (bma->wasdel) + error = xfs_bmap_add_extent_delay_real(bma); + else + error = xfs_bmap_add_extent_hole_real(bma, whichfork); + + bma->logflags |= tmp_logflags; + if (error) + return error; + + /* + * Update our extent pointer, given that xfs_bmap_add_extent_delay_real + * or xfs_bmap_add_extent_hole_real might have merged it into one of + * the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + + ASSERT(bma->got.br_startoff <= bma->offset); + ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= + bma->offset + bma->length); + ASSERT(bma->got.br_state == XFS_EXT_NORM || + bma->got.br_state == XFS_EXT_UNWRITTEN); + return 0; +} + +STATIC int +xfs_bmapi_convert_unwritten( + struct xfs_bmalloca *bma, + struct xfs_bmbt_irec *mval, + xfs_filblks_t len, + int flags) +{ + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; + int error; + + /* check if we need to do unwritten->real conversion */ + if (mval->br_state == XFS_EXT_UNWRITTEN && + (flags & XFS_BMAPI_PREALLOC)) + return 0; + + /* check if we need to do real->unwritten conversion */ + if (mval->br_state == XFS_EXT_NORM && + (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) != + (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) + return 0; + + /* + * Modify (by adding) the state flag, if writing. + */ + ASSERT(mval->br_blockcount <= len); + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, + bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; + } + mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) + ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, + &bma->cur, mval, bma->firstblock, bma->flist, + &tmp_logflags); + bma->logflags |= tmp_logflags; + if (error) + return error; + + /* + * Update our extent pointer, given that + * xfs_bmap_add_extent_unwritten_real might have merged it into one + * of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + + /* + * We may have combined previously unwritten space with written space, + * so generate another request. + */ + if (mval->br_blockcount < len) + return EAGAIN; + return 0; +} + +/* + * Map file blocks to filesystem blocks, and allocate blocks or convert the + * extent state if necessary. Details behaviour is controlled by the flags + * parameter. Only allocates blocks from a single allocation group, to avoid + * locking problems. + * * The returned value in "firstblock" from the first call in a transaction * must be remembered and presented to subsequent calls in "firstblock". * An upper bound for the number of blocks to be allocated is supplied to * the first call in "total"; if no allocation group has that many free * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). */ -int /* error */ -xfs_bmapi( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_extlen_t total, /* total blocks needed */ - xfs_bmbt_irec_t *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist) /* i/o: list extents to free */ +int +xfs_bmapi_write( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_extlen_t total, /* total blocks needed */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + struct xfs_bmap_free *flist) /* i/o: list extents to free */ { - xfs_fsblock_t abno; /* allocated block number */ - xfs_extlen_t alen; /* allocated extent length */ - xfs_fileoff_t aoff; /* allocated file offset */ - xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ - xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_fileoff_t end; /* end of mapped file region */ - int eof; /* we've hit the end of extents */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - int error; /* error return */ - xfs_bmbt_irec_t got; /* current file extent record */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extlen_t indlen; /* indirect blocks length */ - xfs_extnum_t lastx; /* last useful extent number */ - int logflags; /* flags for transaction logging */ - xfs_extlen_t minleft; /* min blocks left after allocation */ - xfs_extlen_t minlen; /* min allocation size */ - xfs_mount_t *mp; /* xfs mount structure */ - int n; /* current extent index */ - int nallocs; /* number of extents alloc'd */ - xfs_extnum_t nextents; /* number of extents in file */ - xfs_fileoff_t obno; /* old block number (offset) */ - xfs_bmbt_irec_t prev; /* previous file extent record */ - int tmp_logflags; /* temp flags holder */ - int whichfork; /* data or attr fork */ - char inhole; /* current location is hole in file */ - char wasdelay; /* old extent was delayed */ - char wr; /* this is a write request */ - char rt; /* this is a realtime file */ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */ + xfs_fileoff_t end; /* end of mapped file region */ + int eof; /* after the end of extents */ + int error; /* error return */ + int n; /* current extent index */ + xfs_fileoff_t obno; /* old block number (offset) */ + int whichfork; /* data or attr fork */ + char inhole; /* current location is hole in file */ + char wasdelay; /* old extent was delayed */ + #ifdef DEBUG - xfs_fileoff_t orig_bno; /* original block number value */ - int orig_flags; /* original flags arg value */ - xfs_filblks_t orig_len; /* original value of len arg */ - xfs_bmbt_irec_t *orig_mval; /* original value of mval */ - int orig_nmap; /* original value of *nmap */ + xfs_fileoff_t orig_bno; /* original block number value */ + int orig_flags; /* original flags arg value */ + xfs_filblks_t orig_len; /* original value of len arg */ + struct xfs_bmbt_irec *orig_mval; /* original value of mval */ + int orig_nmap; /* original value of *nmap */ orig_bno = bno; orig_len = len; @@ -4362,488 +4844,133 @@ xfs_bmapi( orig_mval = mval; orig_nmap = *nmap; #endif + ASSERT(*nmap >= 1); - ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE)); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & XFS_BMAPI_IGSTATE)); + ASSERT(tp != NULL); + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; - mp = ip->i_mount; + if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp); + XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } + if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); - if ((wr = (flags & XFS_BMAPI_WRITE)) != 0) - XFS_STATS_INC(xs_blk_mapw); - else - XFS_STATS_INC(xs_blk_mapr); - /* - * IGSTATE flag is used to combine extents which - * differ only due to the state of the extents. - * This technique is used from xfs_getbmap() - * when the caller does not wish to see the - * separation (which is the default). - * - * This technique is also used when writing a - * buffer which has been partially written, - * (usually by being flushed during a chunkread), - * to ensure one write takes place. This also - * prevents a change in the xfs inode extents at - * this time, intentionally. This change occurs - * on completion of the write operation, in - * xfs_strat_comp(), where the xfs_bmapi() call - * is transactioned, and the extents combined. - */ - if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */ - wr = 0; /* no allocations are allowed */ - ASSERT(wr || !(flags & XFS_BMAPI_DELAY)); - logflags = 0; - nallocs = 0; - cur = NULL; + + XFS_STATS_INC(xs_blk_mapw); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - ASSERT(wr && tp); - if ((error = xfs_bmap_local_to_extents(tp, ip, - firstblock, total, &logflags, whichfork))) + error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, + &bma.logflags, whichfork); + if (error) goto error0; } - if (wr && *firstblock == NULLFSBLOCK) { + + if (*firstblock == NULLFSBLOCK) { if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) - minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; + bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; else - minleft = 1; - } else - minleft = 0; - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - goto error0; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + bma.minleft = 1; + } else { + bma.minleft = 0; + } + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + goto error0; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, + &bma.prev); n = 0; end = bno + len; obno = bno; - bma.ip = NULL; + + bma.tp = tp; + bma.ip = ip; + bma.total = total; + bma.userdata = 0; + bma.flist = flist; + bma.firstblock = firstblock; while (bno < end && n < *nmap) { - /* - * Reading past eof, act as though there's a hole - * up to end. - */ - if (eof && !wr) - got.br_startoff = end; - inhole = eof || got.br_startoff > bno; - wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) && - isnullstartblock(got.br_startblock); + inhole = eof || bma.got.br_startoff > bno; + wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); + /* * First, deal with the hole before the allocated space * that we found, if any. */ - if (wr && (inhole || wasdelay)) { - /* - * For the wasdelay case, we could also just - * allocate the stuff asked for in this bmap call - * but that wouldn't be as good. - */ - if (wasdelay) { - alen = (xfs_extlen_t)got.br_blockcount; - aoff = got.br_startoff; - if (lastx != NULLEXTNUM && lastx) { - ep = xfs_iext_get_ext(ifp, lastx - 1); - xfs_bmbt_get_all(ep, &prev); - } - } else { - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(len, MAXEXTLEN); - if (!eof) - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(alen, - got.br_startoff - bno); - aoff = bno; - } - minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; - if (flags & XFS_BMAPI_DELAY) { - xfs_extlen_t extsz; - - /* Figure out the extent size, adjust alen */ - extsz = xfs_get_extsz_hint(ip); - if (extsz) { - /* - * make sure we don't exceed a single - * extent length when we align the - * extent by reducing length we are - * going to allocate by the maximum - * amount extent size aligment may - * require. - */ - alen = XFS_FILBLKS_MIN(len, - MAXEXTLEN - (2 * extsz - 1)); - error = xfs_bmap_extsize_align(mp, - &got, &prev, extsz, - rt, eof, - flags&XFS_BMAPI_DELAY, - flags&XFS_BMAPI_CONVERT, - &aoff, &alen); - ASSERT(!error); - } - - if (rt) - extsz = alen / mp->m_sb.sb_rextsize; - - /* - * Make a transaction-less quota reservation for - * delayed allocation blocks. This number gets - * adjusted later. We return if we haven't - * allocated blocks already inside this loop. - */ - error = xfs_trans_reserve_quota_nblks( - NULL, ip, (long)alen, 0, - rt ? XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS); - if (error) { - if (n == 0) { - *nmap = 0; - ASSERT(cur == NULL); - return error; - } - break; - } - - /* - * Split changing sb for alen and indlen since - * they could be coming from different places. - */ - indlen = (xfs_extlen_t) - xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); - - if (rt) { - error = xfs_mod_incore_sb(mp, - XFS_SBS_FREXTENTS, - -((int64_t)extsz), 0); - } else { - error = xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - -((int64_t)alen), 0); - } - if (!error) { - error = xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - -((int64_t)indlen), 0); - if (error && rt) - xfs_mod_incore_sb(mp, - XFS_SBS_FREXTENTS, - (int64_t)extsz, 0); - else if (error) - xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - (int64_t)alen, 0); - } - - if (error) { - if (XFS_IS_QUOTA_ON(mp)) - /* unreserve the blocks now */ - (void) - xfs_trans_unreserve_quota_nblks( - NULL, ip, - (long)alen, 0, rt ? - XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS); - break; - } - - ip->i_delayed_blks += alen; - abno = nullstartblock(indlen); - } else { - /* - * If first time, allocate and fill in - * once-only bma fields. - */ - if (bma.ip == NULL) { - bma.tp = tp; - bma.ip = ip; - bma.prevp = &prev; - bma.gotp = &got; - bma.total = total; - bma.userdata = 0; - } - /* Indicate if this is the first user data - * in the file, or just any user data. - */ - if (!(flags & XFS_BMAPI_METADATA)) { - bma.userdata = (aoff == 0) ? - XFS_ALLOC_INITIAL_USER_DATA : - XFS_ALLOC_USERDATA; - } - /* - * Fill in changeable bma fields. - */ - bma.eof = eof; - bma.firstblock = *firstblock; - bma.alen = alen; - bma.off = aoff; - bma.conv = !!(flags & XFS_BMAPI_CONVERT); - bma.wasdel = wasdelay; - bma.minlen = minlen; - bma.low = flist->xbf_low; - bma.minleft = minleft; - /* - * Only want to do the alignment at the - * eof if it is userdata and allocation length - * is larger than a stripe unit. - */ - if (mp->m_dalign && alen >= mp->m_dalign && - (!(flags & XFS_BMAPI_METADATA)) && - (whichfork == XFS_DATA_FORK)) { - if ((error = xfs_bmap_isaeof(ip, aoff, - whichfork, &bma.aeof))) - goto error0; - } else - bma.aeof = 0; - /* - * Call allocator. - */ - if ((error = xfs_bmap_alloc(&bma))) - goto error0; - /* - * Copy out result fields. - */ - abno = bma.rval; - if ((flist->xbf_low = bma.low)) - minleft = 0; - alen = bma.alen; - aoff = bma.off; - ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == - XFS_FSB_TO_AGNO(mp, bma.firstblock) || - (flist->xbf_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, bma.firstblock))); - *firstblock = bma.firstblock; - if (cur) - cur->bc_private.b.firstblock = - *firstblock; - if (abno == NULLFSBLOCK) - break; - if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_bmbt_init_cursor(mp, tp, - ip, whichfork); - cur->bc_private.b.firstblock = - *firstblock; - cur->bc_private.b.flist = flist; - } - /* - * Bump the number of extents we've allocated - * in this call. - */ - nallocs++; - } - if (cur) - cur->bc_private.b.flags = - wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0; - got.br_startoff = aoff; - got.br_startblock = abno; - got.br_blockcount = alen; - got.br_state = XFS_EXT_NORM; /* assume normal */ - /* - * Determine state of extent, and the filesystem. - * A wasdelay extent has been initialized, so - * shouldn't be flagged as unwritten. - */ - if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) { - if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) - got.br_state = XFS_EXT_UNWRITTEN; - } - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &got, - firstblock, flist, &tmp_logflags, - whichfork); - logflags |= tmp_logflags; + if (inhole || wasdelay) { + bma.eof = eof; + bma.conv = !!(flags & XFS_BMAPI_CONVERT); + bma.wasdel = wasdelay; + bma.length = len; + bma.offset = bno; + + error = xfs_bmapi_allocate(&bma, flags); if (error) goto error0; - ep = xfs_iext_get_ext(ifp, lastx); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - xfs_bmbt_get_all(ep, &got); - ASSERT(got.br_startoff <= aoff); - ASSERT(got.br_startoff + got.br_blockcount >= - aoff + alen); -#ifdef DEBUG - if (flags & XFS_BMAPI_DELAY) { - ASSERT(isnullstartblock(got.br_startblock)); - ASSERT(startblockval(got.br_startblock) > 0); - } - ASSERT(got.br_state == XFS_EXT_NORM || - got.br_state == XFS_EXT_UNWRITTEN); -#endif - /* - * Fall down into the found allocated space case. - */ - } else if (inhole) { - /* - * Reading in a hole. - */ - mval->br_startoff = bno; - mval->br_startblock = HOLESTARTBLOCK; - mval->br_blockcount = - XFS_FILBLKS_MIN(len, got.br_startoff - bno); - mval->br_state = XFS_EXT_NORM; - bno += mval->br_blockcount; - len -= mval->br_blockcount; - mval++; - n++; - continue; - } - /* - * Then deal with the allocated space we found. - */ - ASSERT(ep != NULL); - if (!(flags & XFS_BMAPI_ENTIRE) && - (got.br_startoff + got.br_blockcount > obno)) { - if (obno > bno) - bno = obno; - ASSERT((bno >= obno) || (n == 0)); - ASSERT(bno < end); - mval->br_startoff = bno; - if (isnullstartblock(got.br_startblock)) { - ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); - mval->br_startblock = DELAYSTARTBLOCK; - } else - mval->br_startblock = - got.br_startblock + - (bno - got.br_startoff); - /* - * Return the minimum of what we got and what we - * asked for for the length. We can use the len - * variable here because it is modified below - * and we could have been there before coming - * here if the first part of the allocation - * didn't overlap what was asked for. - */ - mval->br_blockcount = - XFS_FILBLKS_MIN(end - bno, got.br_blockcount - - (bno - got.br_startoff)); - mval->br_state = got.br_state; - ASSERT(mval->br_blockcount <= len); - } else { - *mval = got; - if (isnullstartblock(mval->br_startblock)) { - ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); - mval->br_startblock = DELAYSTARTBLOCK; - } + if (bma.blkno == NULLFSBLOCK) + break; } - /* - * Check if writing previously allocated but - * unwritten extents. - */ - if (wr && - ((mval->br_state == XFS_EXT_UNWRITTEN && - ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) || - (mval->br_state == XFS_EXT_NORM && - ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) == - (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) { - /* - * Modify (by adding) the state flag, if writing. - */ - ASSERT(mval->br_blockcount <= len); - if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_bmbt_init_cursor(mp, - tp, ip, whichfork); - cur->bc_private.b.firstblock = - *firstblock; - cur->bc_private.b.flist = flist; - } - mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) - ? XFS_EXT_NORM - : XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, mval, - firstblock, flist, &tmp_logflags, - whichfork); - logflags |= tmp_logflags; - if (error) - goto error0; - ep = xfs_iext_get_ext(ifp, lastx); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - xfs_bmbt_get_all(ep, &got); - /* - * We may have combined previously unwritten - * space with written space, so generate - * another request. - */ - if (mval->br_blockcount < len) - continue; - } + /* Deal with the allocated space we found. */ + xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno, + end, n, flags); + + /* Execute unwritten extent conversion if necessary */ + error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags); + if (error == EAGAIN) + continue; + if (error) + goto error0; + + /* update the extent map to return */ + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); - ASSERT((flags & XFS_BMAPI_ENTIRE) || - ((mval->br_startoff + mval->br_blockcount) <= end)); - ASSERT((flags & XFS_BMAPI_ENTIRE) || - (mval->br_blockcount <= len) || - (mval->br_startoff < obno)); - bno = mval->br_startoff + mval->br_blockcount; - len = end - bno; - if (n > 0 && mval->br_startoff == mval[-1].br_startoff) { - ASSERT(mval->br_startblock == mval[-1].br_startblock); - ASSERT(mval->br_blockcount > mval[-1].br_blockcount); - ASSERT(mval->br_state == mval[-1].br_state); - mval[-1].br_blockcount = mval->br_blockcount; - mval[-1].br_state = mval->br_state; - } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK && - mval[-1].br_startblock != DELAYSTARTBLOCK && - mval[-1].br_startblock != HOLESTARTBLOCK && - mval->br_startblock == - mval[-1].br_startblock + mval[-1].br_blockcount && - ((flags & XFS_BMAPI_IGSTATE) || - mval[-1].br_state == mval->br_state)) { - ASSERT(mval->br_startoff == - mval[-1].br_startoff + mval[-1].br_blockcount); - mval[-1].br_blockcount += mval->br_blockcount; - } else if (n > 0 && - mval->br_startblock == DELAYSTARTBLOCK && - mval[-1].br_startblock == DELAYSTARTBLOCK && - mval->br_startoff == - mval[-1].br_startoff + mval[-1].br_blockcount) { - mval[-1].br_blockcount += mval->br_blockcount; - mval[-1].br_state = mval->br_state; - } else if (!((n == 0) && - ((mval->br_startoff + mval->br_blockcount) <= - obno))) { - mval++; - n++; - } /* * If we're done, stop now. Stop when we've allocated * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise * the transaction may get too big. */ - if (bno >= end || n >= *nmap || nallocs >= *nmap) + if (bno >= end || n >= *nmap || bma.nallocs >= *nmap) break; - /* - * Else go on to the next record. - */ - prev = got; - if (++lastx < nextents) { - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); - } else { + + /* Else go on to the next record. */ + bma.prev = bma.got; + if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), + &bma.got); + } else eof = 1; - } } *nmap = n; + /* * Transform from btree to extents, give it cur. */ - if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { - ASSERT(wr && cur); - error = xfs_bmap_btree_to_extents(tp, ip, cur, + int tmp_logflags = 0; + + ASSERT(bma.cur); + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &tmp_logflags, whichfork); - logflags |= tmp_logflags; + bma.logflags |= tmp_logflags; if (error) goto error0; } @@ -4857,34 +4984,33 @@ error0: * Log everything. Do this after conversion, there's no point in * logging the extent records if we've converted to btree format. */ - if ((logflags & xfs_ilog_fext(whichfork)) && + if ((bma.logflags & xfs_ilog_fext(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - logflags &= ~xfs_ilog_fext(whichfork); - else if ((logflags & xfs_ilog_fbroot(whichfork)) && + bma.logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - logflags &= ~xfs_ilog_fbroot(whichfork); + bma.logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log whatever the flags say, even if error. Otherwise we might miss * detecting a case where the data is changed, there's an error, * and it's not logged so we don't shutdown when we should. */ - if (logflags) { - ASSERT(tp && wr); - xfs_trans_log_inode(tp, ip, logflags); - } - if (cur) { + if (bma.logflags) + xfs_trans_log_inode(tp, ip, bma.logflags); + + if (bma.cur) { if (!error) { ASSERT(*firstblock == NULLFSBLOCK || XFS_FSB_TO_AGNO(mp, *firstblock) == XFS_FSB_TO_AGNO(mp, - cur->bc_private.b.firstblock) || + bma.cur->bc_private.b.firstblock) || (flist->xbf_low && XFS_FSB_TO_AGNO(mp, *firstblock) < XFS_FSB_TO_AGNO(mp, - cur->bc_private.b.firstblock))); - *firstblock = cur->bc_private.b.firstblock; + bma.cur->bc_private.b.firstblock))); + *firstblock = bma.cur->bc_private.b.firstblock; } - xfs_btree_del_cursor(cur, + xfs_btree_del_cursor(bma.cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); } if (!error) @@ -4894,58 +5020,6 @@ error0: } /* - * Map file blocks to filesystem blocks, simple version. - * One block (extent) only, read-only. - * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. - * For the other flag values, the effect is as if XFS_BMAPI_METADATA - * was set and all the others were clear. - */ -int /* error */ -xfs_bmapi_single( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - xfs_fsblock_t *fsb, /* output: mapped block */ - xfs_fileoff_t bno) /* starting file offs. mapped */ -{ - int eof; /* we've hit the end of extents */ - int error; /* error return */ - xfs_bmbt_irec_t got; /* current file extent record */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last useful extent number */ - xfs_bmbt_irec_t prev; /* previous file extent record */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - if (unlikely( - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) { - XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - XFS_STATS_INC(xs_blk_mapr); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - /* - * Reading past eof, act as though there's a hole - * up to end. - */ - if (eof || got.br_startoff > bno) { - *fsb = NULLFSBLOCK; - return 0; - } - ASSERT(!isnullstartblock(got.br_startblock)); - ASSERT(bno < got.br_startoff + got.br_blockcount); - *fsb = got.br_startblock + (bno - got.br_startoff); - return 0; -} - -/* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to * that value. If not all extents in the block range can be removed then @@ -5116,9 +5190,9 @@ xfs_bunmapi( del.br_blockcount = mod; } del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &del, - firstblock, flist, &logflags, - XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, ip, + &lastx, &cur, &del, firstblock, flist, + &logflags); if (error) goto error0; goto nodelete; @@ -5174,18 +5248,18 @@ xfs_bunmapi( } prev.br_state = XFS_EXT_UNWRITTEN; lastx--; - error = xfs_bmap_add_extent(tp, ip, &lastx, - &cur, &prev, firstblock, flist, - &logflags, XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &prev, + firstblock, flist, &logflags); if (error) goto error0; goto nodelete; } else { ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, - &cur, &del, firstblock, flist, - &logflags, XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &del, + firstblock, flist, &logflags); if (error) goto error0; goto nodelete; @@ -5507,10 +5581,9 @@ xfs_getbmap( do { nmap = (nexleft > subnex) ? subnex : nexleft; - error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), - XFS_BB_TO_FSB(mp, bmv->bmv_length), - bmapi_flags, NULL, 0, map, &nmap, - NULL); + error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), + XFS_BB_TO_FSB(mp, bmv->bmv_length), + map, &nmap, bmapi_flags); if (error) goto out_free_map; ASSERT(nmap <= subnex); @@ -5584,89 +5657,6 @@ xfs_getbmap( return error; } -/* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - */ -STATIC int /* error */ -xfs_bmap_isaeof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t off, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - char *aeof) /* return value */ -{ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_irec_t s; /* expanded extent record */ - - ASSERT(whichfork == XFS_DATA_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(NULL, ip, whichfork))) - return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *aeof = 1; - return 0; - } - /* - * Go to the last extent - */ - lastrec = xfs_iext_get_ext(ifp, nextents - 1); - xfs_bmbt_get_all(lastrec, &s); - /* - * Check we are allocating in the last extent (for delayed allocations) - * or past the last extent for non-delayed allocations. - */ - *aeof = (off >= s.br_startoff && - off < s.br_startoff + s.br_blockcount && - isnullstartblock(s.br_startblock)) || - off >= s.br_startoff + s.br_blockcount; - return 0; -} - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary. - */ -int /* error */ -xfs_bmap_eof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t endoff, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - int *eof) /* result value */ -{ - xfs_fsblock_t blockcount; /* extent block count */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_fileoff_t startoff; /* extent starting file offset */ - - ASSERT(whichfork == XFS_DATA_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(NULL, ip, whichfork))) - return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *eof = 1; - return 0; - } - /* - * Go to the last extent - */ - lastrec = xfs_iext_get_ext(ifp, nextents - 1); - startoff = xfs_bmbt_get_startoff(lastrec); - blockcount = xfs_bmbt_get_blockcount(lastrec); - *eof = endoff >= startoff + blockcount; - return 0; -} - #ifdef DEBUG STATIC struct xfs_buf * xfs_bmap_get_bp( @@ -6101,9 +6091,8 @@ xfs_bmap_punch_delalloc_range( * trying to remove a real extent (which requires a * transaction) or a hole, which is probably a bad idea... */ - error = xfs_bmapi(NULL, ip, start_fsb, 1, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, - &nimaps, NULL); + error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, + XFS_BMAPI_ENTIRE); if (error) { /* something screwed, just bail */ diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index c62234bde053..89ee672d378a 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -62,27 +62,23 @@ typedef struct xfs_bmap_free #define XFS_BMAP_MAX_NMAP 4 /* - * Flags for xfs_bmapi + * Flags for xfs_bmapi_* */ -#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ -#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ -#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ -#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ -#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ -#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ -#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ +#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ +#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ +#define XFS_BMAPI_IGSTATE 0x010 /* Ignore state - */ /* combine contig. space */ -#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ +#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. */ -#define XFS_BMAPI_CONVERT 0x200 +#define XFS_BMAPI_CONVERT 0x040 #define XFS_BMAPI_FLAGS \ - { XFS_BMAPI_WRITE, "WRITE" }, \ - { XFS_BMAPI_DELAY, "DELAY" }, \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ @@ -113,21 +109,28 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) * Argument structure for xfs_bmap_alloc. */ typedef struct xfs_bmalloca { - xfs_fsblock_t firstblock; /* i/o first block allocated */ - xfs_fsblock_t rval; /* starting block of new extent */ - xfs_fileoff_t off; /* offset in file filling in */ + xfs_fsblock_t *firstblock; /* i/o first block allocated */ + struct xfs_bmap_free *flist; /* bmap freelist */ struct xfs_trans *tp; /* transaction pointer */ struct xfs_inode *ip; /* incore inode pointer */ - struct xfs_bmbt_irec *prevp; /* extent before the new one */ - struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ - xfs_extlen_t alen; /* i/o length asked/allocated */ + struct xfs_bmbt_irec prev; /* extent before the new one */ + struct xfs_bmbt_irec got; /* extent after, or delayed */ + + xfs_fileoff_t offset; /* offset in file filling in */ + xfs_extlen_t length; /* i/o length asked/allocated */ + xfs_fsblock_t blkno; /* starting block of new extent */ + + struct xfs_btree_cur *cur; /* btree cursor */ + xfs_extnum_t idx; /* current extent index */ + int nallocs;/* number of extents alloc'd */ + int logflags;/* flags for transaction logging */ + xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ char eof; /* set if allocating past last extent */ char wasdel; /* replacing a delayed allocation */ char userdata;/* set if is user data */ - char low; /* low on space, using seq'l ags */ char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ } xfs_bmalloca_t; @@ -152,251 +155,62 @@ typedef struct xfs_bmalloca { { BMAP_RIGHT_FILLING, "RF" }, \ { BMAP_ATTRFORK, "ATTR" } -/* - * Add bmap trace insert entries for all the contents of the extent list. - * - * Quite excessive tracing. Only do this for debug builds. - */ #if defined(__KERNEL) && defined(DEBUG) -void -xfs_bmap_trace_exlist( - struct xfs_inode *ip, /* incore inode pointer */ - xfs_extnum_t cnt, /* count of entries in list */ - int whichfork, - unsigned long caller_ip); /* data or attr fork */ +void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, + int whichfork, unsigned long caller_ip); #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) #else #define XFS_BMAP_TRACE_EXLIST(ip,c,w) #endif -/* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. - */ -int /* error code */ -xfs_bmap_add_attrfork( - struct xfs_inode *ip, /* incore inode pointer */ - int size, /* space needed for new attribute */ - int rsvd); /* flag for reserved block allocation */ - -/* - * Add the extent to the list of extents to be free at transaction end. - * The list is maintained sorted (by block number). - */ -void -xfs_bmap_add_free( - xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - struct xfs_mount *mp); /* mount point structure */ - -/* - * Routine to clean up the free list data structure when - * an error occurs during a transaction. - */ -void -xfs_bmap_cancel( - xfs_bmap_free_t *flist); /* free list to clean up */ - -/* - * Compute and fill in the value of the maximum depth of a bmap btree - * in this filesystem. Done once, during mount. - */ -void -xfs_bmap_compute_maxlevels( - struct xfs_mount *mp, /* file system mount structure */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the first unused block in the file. - * This is the lowest-address hole if the file has holes, else the first block - * past the end of file. - */ -int /* error */ -xfs_bmap_first_unused( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_extlen_t len, /* size of hole to find */ - xfs_fileoff_t *unused, /* unused block num */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the last block + 1 before - * last_block (input value) in the file. - * This is not based on i_size, it is based on the extent list. - * Returns 0 for local files, as they do not have an extent list. - */ -int /* error */ -xfs_bmap_last_before( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the first block past eof in - * the file. This is not based on i_size, it is based on the extent list. - * Returns 0 for local files, as they do not have an extent list. - */ -int /* error */ -xfs_bmap_last_offset( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t *unused, /* last block num */ - int whichfork); /* data or attr fork */ - -/* - * Returns whether the selected fork of the inode has exactly one - * block or not. For the data fork we check this matches di_size, - * implying the file's range is 0..bsize-1. - */ -int -xfs_bmap_one_block( - struct xfs_inode *ip, /* incore inode */ - int whichfork); /* data or attr fork */ - -/* - * Read in the extents to iu_extents. - * All inode fields are set up by caller, we just traverse the btree - * and copy the records in. - */ -int /* error */ -xfs_bmap_read_extents( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - int whichfork); /* data or attr fork */ - -/* - * Map file blocks to filesystem blocks. - * File range is given by the bno/len pair. - * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) - * into a hole or past eof. - * Only allocates blocks from a single allocation group, - * to avoid locking problems. - * The returned value in "firstblock" from the first call in a transaction - * must be remembered and presented to subsequent calls in "firstblock". - * An upper bound for the number of blocks to be allocated is supplied to - * the first call in "total"; if no allocation group has that many free - * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). - */ -int /* error */ -xfs_bmapi( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_extlen_t total, /* total blocks needed */ - struct xfs_bmbt_irec *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist); /* i/o: list extents to free */ - -/* - * Map file blocks to filesystem blocks, simple version. - * One block only, read-only. - * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. - * For the other flag values, the effect is as if XFS_BMAPI_METADATA - * was set and all the others were clear. - */ -int /* error */ -xfs_bmapi_single( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - xfs_fsblock_t *fsb, /* output: mapped block */ - xfs_fileoff_t bno); /* starting file offs. mapped */ - -/* - * Unmap (remove) blocks from a file. - * If nexts is nonzero then the number of extents to remove is limited to - * that value. If not all extents in the block range can be removed then - * *done is set. - */ -int /* error */ -xfs_bunmapi( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting offset to unmap */ - xfs_filblks_t len, /* length to unmap in file */ - int flags, /* XFS_BMAPI_... */ - xfs_extnum_t nexts, /* number of extents max */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *done); /* set if not done yet */ - -/* - * Check an extent list, which has just been read, for - * any bit in the extent flag field. - */ -int -xfs_check_nostate_extents( - struct xfs_ifork *ifp, - xfs_extnum_t idx, - xfs_extnum_t num); - -uint -xfs_default_attroffset( - struct xfs_inode *ip); +int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, + struct xfs_bmap_free *flist, struct xfs_mount *mp); +void xfs_bmap_cancel(struct xfs_bmap_free *flist); +void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); +int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *last_block, int whichfork); +int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); +int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork); +int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); +int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); +int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fsblock_t *firstblock, xfs_extlen_t total, + struct xfs_bmbt_irec *mval, int *nmap, + struct xfs_bmap_free *flist); +int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_extnum_t nexts, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, int *done); +int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, + xfs_extnum_t num); +uint xfs_default_attroffset(struct xfs_inode *ip); #ifdef __KERNEL__ - -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. - * - * Return 1 if the given transaction was committed and a new one allocated, - * and 0 otherwise. - */ -int /* error */ -xfs_bmap_finish( - struct xfs_trans **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed); /* xact committed or not */ - /* bmap to userspace formatter - copy to user & advance pointer */ typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); -/* - * Get inode's extents as described in bmv, and format for output. - */ -int /* error code */ -xfs_getbmap( - xfs_inode_t *ip, - struct getbmapx *bmv, /* user bmap structure */ - xfs_bmap_format_t formatter, /* format to user */ - void *arg); /* formatter arg */ - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary - */ -int -xfs_bmap_eof( - struct xfs_inode *ip, - xfs_fileoff_t endoff, - int whichfork, - int *eof); - -/* - * Count fsblocks of the given fork. - */ -int -xfs_bmap_count_blocks( - xfs_trans_t *tp, - struct xfs_inode *ip, - int whichfork, - int *count); - -int -xfs_bmap_punch_delalloc_range( - struct xfs_inode *ip, - xfs_fileoff_t start_fsb, - xfs_fileoff_t length); +int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, + int *committed); +int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, + xfs_bmap_format_t formatter, void *arg); +int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, + int whichfork, int *eof); +int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, int *count); +int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, + xfs_fileoff_t start_fsb, xfs_fileoff_t length); #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index cabf4b5604aa..1f19f03af9d3 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -275,8 +275,7 @@ xfs_btree_dup_cursor( return error; } new->bc_bufs[i] = bp; - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); } else new->bc_bufs[i] = NULL; } @@ -467,8 +466,7 @@ xfs_btree_get_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); return bp; } @@ -491,8 +489,7 @@ xfs_btree_get_bufs( ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); return bp; } @@ -632,9 +629,9 @@ xfs_btree_read_bufl( mp->m_bsize, lock, &bp))) { return error; } - ASSERT(!bp || !XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); if (bp) - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); + xfs_buf_set_ref(bp, refval); *bpp = bp; return 0; } @@ -942,13 +939,13 @@ xfs_btree_set_refs( switch (cur->bc_btnum) { case XFS_BTNUM_BNO: case XFS_BTNUM_CNT: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); + xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); break; case XFS_BTNUM_INO: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); + xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); break; case XFS_BTNUM_BMAP: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); + xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); break; default: ASSERT(0); @@ -973,8 +970,8 @@ xfs_btree_get_buf_block( *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, flags); - ASSERT(*bpp); - ASSERT(!XFS_BUF_GETERROR(*bpp)); + if (!*bpp) + return ENOMEM; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; @@ -1006,8 +1003,7 @@ xfs_btree_read_buf_block( if (error) return error; - ASSERT(*bpp != NULL); - ASSERT(!XFS_BUF_GETERROR(*bpp)); + ASSERT(!xfs_buf_geterror(*bpp)); xfs_btree_set_refs(cur, *bpp); *block = XFS_BUF_TO_BLOCK(*bpp); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 8d05a6a46ce3..5b240de104c0 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -262,7 +262,7 @@ typedef struct xfs_btree_cur /* * Convert from buffer to btree block header. */ -#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) /* diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/xfs_buf.c index b2b411985591..cf0ac056815f 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -43,7 +43,6 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; @@ -66,10 +65,6 @@ struct workqueue_struct *xfsconvertd_workqueue; #define xb_to_km(flags) \ (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) -#define xfs_buf_allocate(flags) \ - kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) -#define xfs_buf_deallocate(bp) \ - kmem_zone_free(xfs_buf_zone, (bp)); static inline int xfs_buf_is_vmapped( @@ -152,6 +147,7 @@ xfs_buf_stale( struct xfs_buf *bp) { bp->b_flags |= XBF_STALE; + xfs_buf_delwri_dequeue(bp); atomic_set(&(bp)->b_lru_ref, 0); if (!list_empty(&bp->b_lru)) { struct xfs_buftarg *btp = bp->b_target; @@ -167,14 +163,19 @@ xfs_buf_stale( ASSERT(atomic_read(&bp->b_hold) >= 1); } -STATIC void -_xfs_buf_initialize( - xfs_buf_t *bp, - xfs_buftarg_t *target, +struct xfs_buf * +xfs_buf_alloc( + struct xfs_buftarg *target, xfs_off_t range_base, size_t range_length, xfs_buf_flags_t flags) { + struct xfs_buf *bp; + + bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)); + if (unlikely(!bp)) + return NULL; + /* * We don't want certain flags to appear in b_flags. */ @@ -203,8 +204,9 @@ _xfs_buf_initialize( init_waitqueue_head(&bp->b_waiters); XFS_STATS_INC(xb_create); - trace_xfs_buf_init(bp, _RET_IP_); + + return bp; } /* @@ -277,7 +279,7 @@ xfs_buf_free( } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); - xfs_buf_deallocate(bp); + kmem_zone_free(xfs_buf_zone, bp); } /* @@ -416,10 +418,7 @@ _xfs_buf_map_pages( /* * Look up, and creates if absent, a lockable buffer for * a given range of an inode. The buffer is returned - * locked. If other overlapping buffers exist, they are - * released before the new buffer is created and locked, - * which may imply that this call will block until those buffers - * are unlocked. No I/O is implied by this call. + * locked. No I/O is implied by this call. */ xfs_buf_t * _xfs_buf_find( @@ -481,8 +480,6 @@ _xfs_buf_find( /* No match found */ if (new_bp) { - _xfs_buf_initialize(new_bp, btp, range_base, - range_length, flags); rb_link_node(&new_bp->b_rbnode, parent, rbp); rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); /* the buffer keeps the perag reference until it is freed */ @@ -525,35 +522,51 @@ found: } /* - * Assembles a buffer covering the specified range. - * Storage in memory for all portions of the buffer will be allocated, - * although backing storage may not be. + * Assembles a buffer covering the specified range. The code is optimised for + * cache hits, as metadata intensive workloads will see 3 orders of magnitude + * more hits than misses. */ -xfs_buf_t * +struct xfs_buf * xfs_buf_get( xfs_buftarg_t *target,/* target for buffer */ xfs_off_t ioff, /* starting offset of range */ size_t isize, /* length of range */ xfs_buf_flags_t flags) { - xfs_buf_t *bp, *new_bp; + struct xfs_buf *bp; + struct xfs_buf *new_bp; int error = 0; - new_bp = xfs_buf_allocate(flags); + bp = _xfs_buf_find(target, ioff, isize, flags, NULL); + if (likely(bp)) + goto found; + + new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT, + flags); if (unlikely(!new_bp)) return NULL; bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); + if (!bp) { + kmem_zone_free(xfs_buf_zone, new_bp); + return NULL; + } + if (bp == new_bp) { error = xfs_buf_allocate_memory(bp, flags); if (error) goto no_buffer; - } else { - xfs_buf_deallocate(new_bp); - if (unlikely(bp == NULL)) - return NULL; - } + } else + kmem_zone_free(xfs_buf_zone, new_bp); + /* + * Now we have a workable buffer, fill in the block number so + * that we can do IO on it. + */ + bp->b_bn = ioff; + bp->b_count_desired = bp->b_buffer_length; + +found: if (!(bp->b_flags & XBF_MAPPED)) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { @@ -564,18 +577,10 @@ xfs_buf_get( } XFS_STATS_INC(xb_get); - - /* - * Always fill in the block number now, the mapped cases can do - * their own overlay of this later. - */ - bp->b_bn = ioff; - bp->b_count_desired = bp->b_buffer_length; - trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; - no_buffer: +no_buffer: if (flags & (XBF_LOCK | XBF_TRYLOCK)) xfs_buf_unlock(bp); xfs_buf_rele(bp); @@ -596,7 +601,7 @@ _xfs_buf_read( bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); status = xfs_buf_iorequest(bp); - if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC)) + if (status || bp->b_error || (flags & XBF_ASYNC)) return status; return xfs_buf_iowait(bp); } @@ -679,7 +684,6 @@ xfs_buf_read_uncached( /* set up the buffer for a read IO */ XFS_BUF_SET_ADDR(bp, daddr); XFS_BUF_READ(bp); - XFS_BUF_BUSY(bp); xfsbdstrat(mp, bp); error = xfs_buf_iowait(bp); @@ -690,19 +694,6 @@ xfs_buf_read_uncached( return bp; } -xfs_buf_t * -xfs_buf_get_empty( - size_t len, - xfs_buftarg_t *target) -{ - xfs_buf_t *bp; - - bp = xfs_buf_allocate(0); - if (bp) - _xfs_buf_initialize(bp, target, 0, len, 0); - return bp; -} - /* * Return a buffer allocated as an empty buffer and associated to external * memory via xfs_buf_associate_memory() back to it's empty state. @@ -788,10 +779,9 @@ xfs_buf_get_uncached( int error, i; xfs_buf_t *bp; - bp = xfs_buf_allocate(0); + bp = xfs_buf_alloc(target, 0, len, 0); if (unlikely(bp == NULL)) goto fail; - _xfs_buf_initialize(bp, target, 0, len, 0); error = _xfs_buf_get_pages(bp, page_count, 0); if (error) @@ -819,7 +809,7 @@ xfs_buf_get_uncached( __free_page(bp->b_pages[i]); _xfs_buf_free_pages(bp); fail_free_buf: - xfs_buf_deallocate(bp); + kmem_zone_free(xfs_buf_zone, bp); fail: return NULL; } @@ -938,12 +928,6 @@ void xfs_buf_unlock( struct xfs_buf *bp) { - if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { - atomic_inc(&bp->b_hold); - bp->b_flags |= XBF_ASYNC; - xfs_buf_delwri_queue(bp, 0); - } - XB_CLEAR_OWNER(bp); up(&bp->b_sema); @@ -1020,9 +1004,19 @@ xfs_buf_ioerror( trace_xfs_buf_ioerror(bp, error, _RET_IP_); } +void +xfs_buf_ioerror_alert( + struct xfs_buf *bp, + const char *func) +{ + xfs_alert(bp->b_target->bt_mount, +"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd", + (__uint64_t)XFS_BUF_ADDR(bp), func, + bp->b_error, XFS_BUF_COUNT(bp)); +} + int xfs_bwrite( - struct xfs_mount *mp, struct xfs_buf *bp) { int error; @@ -1034,25 +1028,13 @@ xfs_bwrite( xfs_bdstrat_cb(bp); error = xfs_buf_iowait(bp); - if (error) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - xfs_buf_relse(bp); + if (error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } return error; } -void -xfs_bdwrite( - void *mp, - struct xfs_buf *bp) -{ - trace_xfs_buf_bdwrite(bp, _RET_IP_); - - bp->b_flags &= ~XBF_READ; - bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); - - xfs_buf_delwri_queue(bp, 1); -} - /* * Called when we want to stop a buffer from getting written or read. * We attach the EIO error, muck with its flags, and call xfs_buf_ioend @@ -1069,15 +1051,14 @@ xfs_bioerror( /* * No need to wait until the buffer is unpinned, we aren't flushing it. */ - XFS_BUF_ERROR(bp, EIO); + xfs_buf_ioerror(bp, EIO); /* * We're calling xfs_buf_ioend, so delete XBF_DONE flag. */ XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_ioend(bp, 0); @@ -1094,7 +1075,7 @@ STATIC int xfs_bioerror_relse( struct xfs_buf *bp) { - int64_t fl = XFS_BUF_BFLAGS(bp); + int64_t fl = bp->b_flags; /* * No need to wait until the buffer is unpinned. * We aren't flushing it. @@ -1104,9 +1085,8 @@ xfs_bioerror_relse( * change that interface. */ XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_DONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); bp->b_iodone = NULL; if (!(fl & XBF_ASYNC)) { /* @@ -1115,8 +1095,8 @@ xfs_bioerror_relse( * There's no reason to mark error for * ASYNC buffers. */ - XFS_BUF_ERROR(bp, EIO); - XFS_BUF_FINISH_IOWAIT(bp); + xfs_buf_ioerror(bp, EIO); + complete(&bp->b_iowait); } else { xfs_buf_relse(bp); } @@ -1224,6 +1204,9 @@ _xfs_buf_ioapply( rw = READ; } + /* we only use the buffer cache for meta-data */ + rw |= REQ_META; + next_chunk: atomic_inc(&bp->b_io_remaining); nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); @@ -1273,15 +1256,10 @@ xfs_buf_iorequest( { trace_xfs_buf_iorequest(bp, _RET_IP_); - if (bp->b_flags & XBF_DELWRI) { - xfs_buf_delwri_queue(bp, 1); - return 0; - } + ASSERT(!(bp->b_flags & XBF_DELWRI)); - if (bp->b_flags & XBF_WRITE) { + if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); - } - xfs_buf_hold(bp); /* Set the count to 1 initially, this will stop an I/O @@ -1321,7 +1299,7 @@ xfs_buf_offset( struct page *page; if (bp->b_flags & XBF_MAPPED) - return XFS_BUF_PTR(bp) + offset; + return bp->b_addr + offset; offset += bp->b_offset; page = bp->b_pages[offset >> PAGE_SHIFT]; @@ -1479,9 +1457,13 @@ xfs_setsize_buftarg_flags( btp->bt_smask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { + char name[BDEVNAME_SIZE]; + + bdevname(btp->bt_bdev, name); + xfs_warn(btp->bt_mount, "Cannot set_blocksize to %u on device %s\n", - sectorsize, XFS_BUFTARG_NAME(btp)); + sectorsize, name); return EINVAL; } @@ -1512,12 +1494,12 @@ xfs_setsize_buftarg( } STATIC int -xfs_alloc_delwrite_queue( +xfs_alloc_delwri_queue( xfs_buftarg_t *btp, const char *fsname) { - INIT_LIST_HEAD(&btp->bt_delwrite_queue); - spin_lock_init(&btp->bt_delwrite_lock); + INIT_LIST_HEAD(&btp->bt_delwri_queue); + spin_lock_init(&btp->bt_delwri_lock); btp->bt_flags = 0; btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); if (IS_ERR(btp->bt_task)) @@ -1547,7 +1529,7 @@ xfs_alloc_buftarg( spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - if (xfs_alloc_delwrite_queue(btp, fsname)) + if (xfs_alloc_delwri_queue(btp, fsname)) goto error; btp->bt_shrinker.shrink = xfs_buftarg_shrink; btp->bt_shrinker.seeks = DEFAULT_SEEKS; @@ -1563,56 +1545,48 @@ error: /* * Delayed write buffer handling */ -STATIC void +void xfs_buf_delwri_queue( - xfs_buf_t *bp, - int unlock) + xfs_buf_t *bp) { - struct list_head *dwq = &bp->b_target->bt_delwrite_queue; - spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; + struct xfs_buftarg *btp = bp->b_target; trace_xfs_buf_delwri_queue(bp, _RET_IP_); - ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); + ASSERT(!(bp->b_flags & XBF_READ)); - spin_lock(dwlk); - /* If already in the queue, dequeue and place at tail */ + spin_lock(&btp->bt_delwri_lock); if (!list_empty(&bp->b_list)) { + /* if already in the queue, move it to the tail */ ASSERT(bp->b_flags & _XBF_DELWRI_Q); - if (unlock) - atomic_dec(&bp->b_hold); - list_del(&bp->b_list); - } - - if (list_empty(dwq)) { + list_move_tail(&bp->b_list, &btp->bt_delwri_queue); + } else { /* start xfsbufd as it is about to have something to do */ - wake_up_process(bp->b_target->bt_task); - } + if (list_empty(&btp->bt_delwri_queue)) + wake_up_process(bp->b_target->bt_task); - bp->b_flags |= _XBF_DELWRI_Q; - list_add_tail(&bp->b_list, dwq); + atomic_inc(&bp->b_hold); + bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC; + list_add_tail(&bp->b_list, &btp->bt_delwri_queue); + } bp->b_queuetime = jiffies; - spin_unlock(dwlk); - - if (unlock) - xfs_buf_unlock(bp); + spin_unlock(&btp->bt_delwri_lock); } void xfs_buf_delwri_dequeue( xfs_buf_t *bp) { - spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; int dequeued = 0; - spin_lock(dwlk); + spin_lock(&bp->b_target->bt_delwri_lock); if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { ASSERT(bp->b_flags & _XBF_DELWRI_Q); list_del_init(&bp->b_list); dequeued = 1; } bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - spin_unlock(dwlk); + spin_unlock(&bp->b_target->bt_delwri_lock); if (dequeued) xfs_buf_rele(bp); @@ -1644,16 +1618,9 @@ xfs_buf_delwri_promote( if (bp->b_queuetime < jiffies - age) return; bp->b_queuetime = jiffies - age; - spin_lock(&btp->bt_delwrite_lock); - list_move(&bp->b_list, &btp->bt_delwrite_queue); - spin_unlock(&btp->bt_delwrite_lock); -} - -STATIC void -xfs_buf_runall_queues( - struct workqueue_struct *queue) -{ - flush_workqueue(queue); + spin_lock(&btp->bt_delwri_lock); + list_move(&bp->b_list, &btp->bt_delwri_queue); + spin_unlock(&btp->bt_delwri_lock); } /* @@ -1667,18 +1634,16 @@ xfs_buf_delwri_split( unsigned long age) { xfs_buf_t *bp, *n; - struct list_head *dwq = &target->bt_delwrite_queue; - spinlock_t *dwlk = &target->bt_delwrite_lock; int skipped = 0; int force; force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); INIT_LIST_HEAD(list); - spin_lock(dwlk); - list_for_each_entry_safe(bp, n, dwq, b_list) { + spin_lock(&target->bt_delwri_lock); + list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) { ASSERT(bp->b_flags & XBF_DELWRI); - if (!XFS_BUF_ISPINNED(bp) && xfs_buf_trylock(bp)) { + if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { if (!force && time_before(jiffies, bp->b_queuetime + age)) { xfs_buf_unlock(bp); @@ -1692,10 +1657,9 @@ xfs_buf_delwri_split( } else skipped++; } - spin_unlock(dwlk); + spin_unlock(&target->bt_delwri_lock); return skipped; - } /* @@ -1745,7 +1709,7 @@ xfsbufd( } /* sleep for a long time if there is nothing to do. */ - if (list_empty(&target->bt_delwrite_queue)) + if (list_empty(&target->bt_delwri_queue)) tout = MAX_SCHEDULE_TIMEOUT; schedule_timeout_interruptible(tout); @@ -1781,9 +1745,7 @@ xfs_flush_buftarg( LIST_HEAD(wait_list); struct blk_plug plug; - xfs_buf_runall_queues(xfsconvertd_workqueue); - xfs_buf_runall_queues(xfsdatad_workqueue); - xfs_buf_runall_queues(xfslogd_workqueue); + flush_workqueue(xfslogd_workqueue); set_bit(XBT_FORCE_FLUSH, &target->bt_flags); pincount = xfs_buf_delwri_split(target, &tmp_list, 0); @@ -1864,11 +1826,3 @@ xfs_buf_terminate(void) destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); } - -#ifdef CONFIG_KDB_MODULES -struct list_head * -xfs_get_buftarg_list(void) -{ - return &xfs_buftarg_list; -} -#endif diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/xfs_buf.h index 6a83b46b4bcf..5bab046e859f 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -105,8 +105,8 @@ typedef struct xfs_buftarg { /* per device delwri queue */ struct task_struct *bt_task; - struct list_head bt_delwrite_queue; - spinlock_t bt_delwrite_lock; + struct list_head bt_delwri_queue; + spinlock_t bt_delwri_lock; unsigned long bt_flags; /* LRU control structures */ @@ -175,7 +175,8 @@ extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); -extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); +struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t, + xfs_buf_flags_t); extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); @@ -197,14 +198,14 @@ extern void xfs_buf_unlock(xfs_buf_t *); ((bp)->b_sema.count <= 0) /* Buffer Read and Write Routines */ -extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); -extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); +extern int xfs_bwrite(struct xfs_buf *bp); extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); extern int xfs_bdstrat_cb(struct xfs_buf *); extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); extern int xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, @@ -221,53 +222,32 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp) extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); /* Delayed Write Buffer Routines */ -extern void xfs_buf_delwri_dequeue(xfs_buf_t *); -extern void xfs_buf_delwri_promote(xfs_buf_t *); +extern void xfs_buf_delwri_queue(struct xfs_buf *); +extern void xfs_buf_delwri_dequeue(struct xfs_buf *); +extern void xfs_buf_delwri_promote(struct xfs_buf *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); -#define xfs_buf_target_name(target) \ - ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) - - -#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) #define XFS_BUF_ZEROFLAGS(bp) \ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) void xfs_buf_stale(struct xfs_buf *bp); -#define XFS_BUF_STALE(bp) xfs_buf_stale(bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) -#define XFS_BUF_SUPER_STALE(bp) do { \ - XFS_BUF_STALE(bp); \ - xfs_buf_delwri_dequeue(bp); \ - XFS_BUF_DONE(bp); \ - } while (0) - -#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) -#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) -#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) -#define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no) -#define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp) -#define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0) +#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) -#define XFS_BUF_BUSY(bp) do { } while (0) -#define XFS_BUF_UNBUSY(bp) do { } while (0) -#define XFS_BUF_ISBUSY(bp) (1) - #define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) #define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) #define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) -#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) #define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) #define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) #define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) @@ -276,10 +256,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) -#define XFS_BUF_SET_START(bp) do { } while (0) - -#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) -#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) #define XFS_BUF_ADDR(bp) ((bp)->b_bn) #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) #define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) @@ -289,23 +265,15 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) -static inline void -xfs_buf_set_ref( - struct xfs_buf *bp, - int lru_ref) +static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { atomic_set(&bp->b_lru_ref, lru_ref); } -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) -#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) - -#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) -#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); - -#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) -#define XFS_BUF_TARGET(bp) ((bp)->b_target) -#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) +static inline int xfs_buf_ispinned(struct xfs_buf *bp) +{ + return atomic_read(&bp->b_pin_count); +} static inline void xfs_buf_relse(xfs_buf_t *bp) { @@ -323,14 +291,7 @@ extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); -#ifdef CONFIG_KDB_MODULES -extern struct list_head *xfs_get_buftarg_list(void); -#endif - #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1) -#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) - #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 88492916c3dc..eac97ef81e2a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -124,9 +124,9 @@ xfs_buf_item_log_check( bp = bip->bli_buf; ASSERT(XFS_BUF_COUNT(bp) > 0); - ASSERT(XFS_BUF_PTR(bp) != NULL); + ASSERT(bp->b_addr != NULL); orig = bip->bli_orig; - buffer = XFS_BUF_PTR(bp); + buffer = bp->b_addr; for (x = 0; x < XFS_BUF_COUNT(bp); x++) { if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { xfs_emerg(bp->b_mount, @@ -371,7 +371,6 @@ xfs_buf_item_pin( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); - ASSERT(XFS_BUF_ISBUSY(bip->bli_buf)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); @@ -479,13 +478,13 @@ xfs_buf_item_trylock( struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - if (XFS_BUF_ISPINNED(bp)) + if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; /* take a reference to the buffer. */ - XFS_BUF_HOLD(bp); + xfs_buf_hold(bp); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); trace_xfs_buf_item_trylock(bip); @@ -630,7 +629,7 @@ xfs_buf_item_push( * the xfsbufd to get this buffer written. We have to unlock the buffer * to allow the xfsbufd to write it, too. */ -STATIC void +STATIC bool xfs_buf_item_pushbuf( struct xfs_log_item *lip) { @@ -644,6 +643,7 @@ xfs_buf_item_pushbuf( xfs_buf_delwri_promote(bp); xfs_buf_relse(bp); + return true; } STATIC void @@ -656,7 +656,7 @@ xfs_buf_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_buf_item_ops = { +static const struct xfs_item_ops xfs_buf_item_ops = { .iop_size = xfs_buf_item_size, .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, @@ -726,7 +726,7 @@ xfs_buf_item_init( * to have logged. */ bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); - memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); + memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp)); bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); #endif @@ -895,7 +895,6 @@ xfs_buf_attach_iodone( { xfs_log_item_t *head_lip; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); lip->li_cb = cb; @@ -960,7 +959,7 @@ xfs_buf_iodone_callbacks( static ulong lasttime; static xfs_buftarg_t *lasttarg; - if (likely(!XFS_BUF_GETERROR(bp))) + if (likely(!xfs_buf_geterror(bp))) goto do_callbacks; /* @@ -968,19 +967,18 @@ xfs_buf_iodone_callbacks( * I/O errors, there's no point in giving this a retry. */ if (XFS_FORCED_SHUTDOWN(mp)) { - XFS_BUF_SUPER_STALE(bp); + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); trace_xfs_buf_item_iodone(bp, _RET_IP_); goto do_callbacks; } - if (XFS_BUF_TARGET(bp) != lasttarg || + if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; - xfs_alert(mp, "Device %s: metadata write error block 0x%llx", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), - (__uint64_t)XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); } - lasttarg = XFS_BUF_TARGET(bp); + lasttarg = bp->b_target; /* * If the write was asynchronous then no one will be looking for the @@ -991,12 +989,11 @@ xfs_buf_iodone_callbacks( * around. */ if (XFS_BUF_ISASYNC(bp)) { - XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ + xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ if (!XFS_BUF_ISSTALE(bp)) { - XFS_BUF_DELAYWRITE(bp); + xfs_buf_delwri_queue(bp); XFS_BUF_DONE(bp); - XFS_BUF_SET_START(bp); } ASSERT(bp->b_iodone != NULL); trace_xfs_buf_item_iodone_async(bp, _RET_IP_); @@ -1008,12 +1005,10 @@ xfs_buf_iodone_callbacks( * If the write of the buffer was synchronous, we want to make * sure to return the error to the caller of xfs_bwrite(). */ - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); XFS_BUF_DONE(bp); - XFS_BUF_UNDELAYWRITE(bp); trace_xfs_buf_error_relse(bp, _RET_IP_); - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); do_callbacks: xfs_buf_do_callbacks(bp); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 2925726529f8..77c74257c2a3 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -692,6 +692,24 @@ xfs_da_join(xfs_da_state_t *state) return(error); } +#ifdef DEBUG +static void +xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) +{ + __be16 magic = blkinfo->magic; + + if (level == 1) { + ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + } else + ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + ASSERT(!blkinfo->forw); + ASSERT(!blkinfo->back); +} +#else /* !DEBUG */ +#define xfs_da_blkinfo_onlychild_validate(blkinfo, level) +#endif /* !DEBUG */ + /* * We have only one entry in the root. Copy the only remaining child of * the old root to block 0 as the new root node. @@ -700,8 +718,6 @@ STATIC int xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) { xfs_da_intnode_t *oldroot; - /* REFERENCED */ - xfs_da_blkinfo_t *blkinfo; xfs_da_args_t *args; xfs_dablk_t child; xfs_dabuf_t *bp; @@ -732,15 +748,9 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) if (error) return(error); ASSERT(bp != NULL); - blkinfo = bp->data; - if (be16_to_cpu(oldroot->hdr.level) == 1) { - ASSERT(blkinfo->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || - blkinfo->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - } else { - ASSERT(blkinfo->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - } - ASSERT(!blkinfo->forw); - ASSERT(!blkinfo->back); + xfs_da_blkinfo_onlychild_validate(bp->data, + be16_to_cpu(oldroot->hdr.level)); + memcpy(root_blk->bp->data, bp->data, state->blocksize); xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); @@ -1568,9 +1578,8 @@ xfs_da_grow_inode_int( */ nmap = 1; ASSERT(args->firstblock != NULL); - error = xfs_bmapi(tp, dp, *bno, count, - xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| - XFS_BMAPI_CONTIG, + error = xfs_bmapi_write(tp, dp, *bno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, args->flist); if (error) @@ -1592,9 +1601,8 @@ xfs_da_grow_inode_int( for (b = *bno, mapi = 0; b < *bno + count; ) { nmap = MIN(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); - error = xfs_bmapi(tp, dp, b, c, - xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| - XFS_BMAPI_METADATA, + error = xfs_bmapi_write(tp, dp, b, c, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->firstblock, args->total, &mapp[mapi], &nmap, args->flist); if (error) @@ -1965,33 +1973,16 @@ xfs_da_do_buf( /* * Optimize the one-block case. */ - if (nfsb == 1) { - xfs_fsblock_t fsb; - - if ((error = - xfs_bmapi_single(trans, dp, whichfork, &fsb, - (xfs_fileoff_t)bno))) { - return error; - } + if (nfsb == 1) mapp = ↦ - if (fsb == NULLFSBLOCK) { - nmap = 0; - } else { - map.br_startblock = fsb; - map.br_startoff = (xfs_fileoff_t)bno; - map.br_blockcount = 1; - nmap = 1; - } - } else { + else mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); - nmap = nfsb; - if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, - nfsb, - XFS_BMAPI_METADATA | - xfs_bmapi_aflag(whichfork), - NULL, 0, mapp, &nmap, NULL))) - goto exit0; - } + + nmap = nfsb; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp, + &nmap, xfs_bmapi_aflag(whichfork)); + if (error) + goto exit0; } else { map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); map.br_startoff = (xfs_fileoff_t)bno; @@ -2040,7 +2031,7 @@ xfs_da_do_buf( case 0: bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, mappedbno, nmapped, 0); - error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO); + error = bp ? bp->b_error : XFS_ERROR(EIO); break; case 1: case 2: @@ -2062,13 +2053,10 @@ xfs_da_do_buf( if (!bp) continue; if (caller == 1) { - if (whichfork == XFS_ATTR_FORK) { - XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE, - XFS_ATTR_BTREE_REF); - } else { - XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE, - XFS_DIR_BTREE_REF); - } + if (whichfork == XFS_ATTR_FORK) + xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); + else + xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); } if (bplist) { bplist[nbplist++] = bp; @@ -2258,7 +2246,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps) dabuf->nbuf = 1; bp = bps[0]; dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); - dabuf->data = XFS_BUF_PTR(bp); + dabuf->data = bp->b_addr; dabuf->bps[0] = bp; } else { dabuf->nbuf = nbuf; @@ -2269,7 +2257,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps) dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { bp = bps[i]; - memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp), + memcpy((char *)dabuf->data + off, bp->b_addr, XFS_BUF_COUNT(bp)); } } @@ -2292,8 +2280,8 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf) for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) { bp = dabuf->bps[i]; - memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off, - XFS_BUF_COUNT(bp)); + memcpy(bp->b_addr, dabuf->data + off, + XFS_BUF_COUNT(bp)); } } } @@ -2330,7 +2318,7 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); if (dabuf->nbuf == 1) { - ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0])); + ASSERT(dabuf->data == dabuf->bps[0]->b_addr); xfs_trans_log_buf(tp, dabuf->bps[0], first, last); return; } diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 9a84a85c03b1..654dc6f05bac 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -425,8 +425,8 @@ xfs_swap_extents( } - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_log_inode(tp, ip, ilf_fields); xfs_trans_log_inode(tp, tip, tilf_fields); @@ -438,7 +438,7 @@ xfs_swap_extents( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); + error = xfs_trans_commit(tp, 0); trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index dffba9ba0db6..a3721633abc8 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -148,7 +148,7 @@ typedef enum xfs_dinode_fmt { be32_to_cpu((dip)->di_nextents) : \ be16_to_cpu((dip)->di_anextents)) -#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr)) /* * For block and character special files the 32bit dev_t is stored at the diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 4580ce00aeb4..a2e27010c7fb 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -121,7 +121,7 @@ xfs_dir_isempty( { xfs_dir2_sf_hdr_t *sfp; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if (dp->i_d.di_size == 0) /* might happen during shutdown. */ return 1; if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) @@ -179,7 +179,7 @@ xfs_dir_init( memset((char *)&args, 0, sizeof(args)); args.dp = dp; args.trans = tp; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) return error; return xfs_dir2_sf_create(&args, pdp->i_ino); @@ -202,7 +202,7 @@ xfs_dir_createname( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) return rval; XFS_STATS_INC(xs_dir_create); @@ -278,7 +278,7 @@ xfs_dir_lookup( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_lookup); memset(&args, 0, sizeof(xfs_da_args_t)); @@ -333,7 +333,7 @@ xfs_dir_removename( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_remove); memset(&args, 0, sizeof(xfs_da_args_t)); @@ -382,7 +382,7 @@ xfs_readdir( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return XFS_ERROR(EIO); - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_getdents); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) @@ -414,7 +414,7 @@ xfs_dir_replace( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) return rval; @@ -464,7 +464,7 @@ xfs_dir_canenter( if (resblks) return 0; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index ca2386d82cdf..66e108f561a3 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -888,12 +888,10 @@ xfs_dir2_leaf_getdents( * we already have in the table. */ nmap = map_size - map_valid; - error = xfs_bmapi(NULL, dp, - map_off, + error = xfs_bmapi_read(dp, map_off, xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - map_off, - XFS_BMAPI_METADATA, NULL, 0, - &map[map_valid], &nmap, NULL); + &map[map_valid], &nmap, 0); /* * Don't know if we should ignore this or * try to return an error. diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 084b3247d636..0179a41d9e5a 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -1564,7 +1564,7 @@ xfs_dir2_node_addname_int( if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { xfs_alert(mp, - "%s: dir ino " "%llu needed freesp block %lld for\n" + "%s: dir ino %llu needed freesp block %lld for\n" " data block %lld, got %lld ifbno %llu lastfbno %d", __func__, (unsigned long long)dp->i_ino, (long long)xfs_dir2_db_to_fdb(mp, dbno), diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/xfs_discard.c index 244e797dae32..8a24f0c6c860 100644 --- a/fs/xfs/linux-2.6/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -38,7 +38,7 @@ xfs_trim_extents( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_fsblock_t start, - xfs_fsblock_t len, + xfs_fsblock_t end, xfs_fsblock_t minlen, __uint64_t *blocks_trimmed) { @@ -100,7 +100,7 @@ xfs_trim_extents( * down partially overlapping ranges for now. */ if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || - XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { + XFS_AGB_TO_FSB(mp, agno, fbno) > end) { trace_xfs_discard_exclude(mp, agno, fbno, flen); goto next_extent; } @@ -145,7 +145,7 @@ xfs_ioc_trim( struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; - xfs_fsblock_t start, len, minlen; + xfs_fsblock_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; __uint64_t blocks_trimmed = 0; int error, last_error = 0; @@ -165,19 +165,19 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface. */ start = XFS_B_TO_FSBT(mp, range.start); - len = XFS_B_TO_FSBT(mp, range.len); + end = start + XFS_B_TO_FSBT(mp, range.len) - 1; minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); - start_agno = XFS_FSB_TO_AGNO(mp, start); - if (start_agno >= mp->m_sb.sb_agcount) + if (start >= mp->m_sb.sb_dblocks) return -XFS_ERROR(EINVAL); + if (end > mp->m_sb.sb_dblocks - 1) + end = mp->m_sb.sb_dblocks - 1; - end_agno = XFS_FSB_TO_AGNO(mp, start + len); - if (end_agno >= mp->m_sb.sb_agcount) - end_agno = mp->m_sb.sb_agcount - 1; + start_agno = XFS_FSB_TO_AGNO(mp, start); + end_agno = XFS_FSB_TO_AGNO(mp, end); for (agno = start_agno; agno <= end_agno; agno++) { - error = -xfs_trim_extents(mp, agno, start, len, minlen, + error = -xfs_trim_extents(mp, agno, start, end, minlen, &blocks_trimmed); if (error) last_error = error; diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/xfs_discard.h index 344879aea646..344879aea646 100644 --- a/fs/xfs/linux-2.6/xfs_discard.h +++ b/fs/xfs/xfs_discard.h diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 837f31158d43..25d7280e9f6b 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -318,10 +318,9 @@ xfs_qm_init_dquot_blk( int curid, i; ASSERT(tp); - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); - d = (xfs_dqblk_t *)XFS_BUF_PTR(bp); + d = bp->b_addr; /* * ID of the first dquot in the block - id's are zero based. @@ -378,16 +377,14 @@ xfs_qm_dqalloc( return (ESRCH); } - xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); nmaps = 1; - if ((error = xfs_bmapi(tp, quotip, - offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, - XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, - &firstblock, - XFS_QM_DQALLOC_SPACE_RES(mp), - &map, &nmaps, &flist))) { + error = xfs_bmapi_write(tp, quotip, offset_fsb, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &flist); + if (error) goto error0; - } ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); ASSERT(nmaps == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -403,8 +400,11 @@ xfs_qm_dqalloc( dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0); - if (!bp || (error = XFS_BUF_GETERROR(bp))) + + error = xfs_buf_geterror(bp); + if (error) goto error1; + /* * Make a chunk of dquots out of this buffer and log * the entire thing. @@ -486,9 +486,8 @@ xfs_qm_dqtobp( /* * Find the block map; no allocations yet */ - error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, - XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, - NULL, 0, &map, &nmaps, NULL); + error = xfs_bmapi_read(quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); xfs_iunlock(quotip, XFS_ILOCK_SHARED); if (error) @@ -534,13 +533,12 @@ xfs_qm_dqtobp( return XFS_ERROR(error); } - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); /* * calculate the location of the dquot inside the buffer. */ - ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); + ddq = bp->b_addr + dqp->q_bufoffset; /* * A simple sanity check in case we got a corrupted dquot... @@ -553,7 +551,6 @@ xfs_qm_dqtobp( xfs_trans_brelse(tp, bp); return XFS_ERROR(EIO); } - XFS_BUF_BUSY(bp); /* We dirtied this */ } *O_bpp = bp; @@ -608,7 +605,7 @@ xfs_qm_dqread( dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); /* Mark the buf so that this will stay incore a little longer */ - XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) @@ -622,7 +619,6 @@ xfs_qm_dqread( * this particular dquot was repaired. We still aren't afraid to * brelse it because we have the changes incore. */ - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); xfs_trans_brelse(tp, bp); @@ -1204,7 +1200,7 @@ xfs_qm_dqflush( /* * Calculate the location of the dquot inside the buffer. */ - ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); + ddqp = bp->b_addr + dqp->q_bufoffset; /* * A simple sanity check in case we got a corrupted dquot.. @@ -1240,15 +1236,17 @@ xfs_qm_dqflush( * If the buffer is pinned then push on the log so we won't * get stuck waiting in the write for too long. */ - if (XFS_BUF_ISPINNED(bp)) { + if (xfs_buf_ispinned(bp)) { trace_xfs_dqflush_force(dqp); xfs_log_force(mp, 0); } if (flags & SYNC_WAIT) - error = xfs_bwrite(mp, bp); + error = xfs_bwrite(bp); else - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + + xfs_buf_relse(bp); trace_xfs_dqflush_done(dqp); @@ -1447,7 +1445,7 @@ xfs_qm_dqflock_pushbuf_wait( goto out_lock; if (XFS_BUF_ISDELAYWRITE(bp)) { - if (XFS_BUF_ISPINNED(bp)) + if (xfs_buf_ispinned(bp)) xfs_log_force(mp, 0); xfs_buf_delwri_promote(bp); wake_up_process(bp->b_target->bt_task); diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 34b7e945dbfa..34b7e945dbfa 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 9e0e2fa3f2c8..0dee0b71029d 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -183,13 +183,14 @@ xfs_qm_dqunpin_wait( * search the buffer cache can be a time consuming thing, and AIL lock is a * spinlock. */ -STATIC void +STATIC bool xfs_qm_dquot_logitem_pushbuf( struct xfs_log_item *lip) { struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); struct xfs_dquot *dqp = qlip->qli_dquot; struct xfs_buf *bp; + bool ret = true; ASSERT(XFS_DQ_IS_LOCKED(dqp)); @@ -201,17 +202,20 @@ xfs_qm_dquot_logitem_pushbuf( if (completion_done(&dqp->q_flush) || !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_dqunlock(dqp); - return; + return true; } bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); xfs_dqunlock(dqp); if (!bp) - return; + return true; if (XFS_BUF_ISDELAYWRITE(bp)) xfs_buf_delwri_promote(bp); + if (xfs_buf_ispinned(bp)) + ret = false; xfs_buf_relse(bp); + return ret; } /* @@ -291,7 +295,7 @@ xfs_qm_dquot_logitem_committing( /* * This is the ops vector for dquots */ -static struct xfs_item_ops xfs_dquot_item_ops = { +static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, @@ -479,7 +483,7 @@ xfs_qm_qoff_logitem_committing( { } -static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, @@ -494,7 +498,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { /* * This is the ops vector shared by all quotaoff-start log items. */ -static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 5acae2ada70b..5acae2ada70b 100644 --- a/fs/xfs/quota/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/xfs_export.c index 75e5d322e48f..da108977b21f 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -229,16 +229,16 @@ xfs_fs_nfs_commit_metadata( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - int error = 0; + xfs_lsn_t lsn = 0; xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) { - error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, - XFS_LOG_SYNC, NULL); - } + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; + if (!lsn) + return 0; + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/xfs_export.h index 3272b6ae7a35..3272b6ae7a35 100644 --- a/fs/xfs/linux-2.6/xfs_export.h +++ b/fs/xfs/xfs_export.h diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d22e62623437..35c2aff38b20 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -217,7 +217,7 @@ xfs_efi_item_committing( /* * This is the ops vector shared by all efi log items. */ -static struct xfs_item_ops xfs_efi_item_ops = { +static const struct xfs_item_ops xfs_efi_item_ops = { .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_pin = xfs_efi_item_pin, @@ -477,7 +477,7 @@ xfs_efd_item_committing( /* * This is the ops vector shared by all efd log items. */ -static struct xfs_item_ops xfs_efd_item_ops = { +static const struct xfs_item_ops xfs_efd_item_ops = { .iop_size = xfs_efd_item_size, .iop_format = xfs_efd_item_format, .iop_pin = xfs_efd_item_pin, diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/xfs_file.c index cca00f49e092..753ed9b5c70b 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -124,6 +124,35 @@ xfs_iozero( return (-status); } +/* + * Fsync operations on directories are much simpler than on regular files, + * as there is no file data to flush, and thus also no need for explicit + * cache flush operations, and there are no non-transaction metadata updates + * on directories either. + */ +STATIC int +xfs_dir_fsync( + struct file *file, + loff_t start, + loff_t end, + int datasync) +{ + struct xfs_inode *ip = XFS_I(file->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + xfs_lsn_t lsn = 0; + + trace_xfs_dir_fsync(ip); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); +} + STATIC int xfs_file_fsync( struct file *file, @@ -137,6 +166,7 @@ xfs_file_fsync( struct xfs_trans *tp; int error = 0; int log_flushed = 0; + xfs_lsn_t lsn = 0; trace_xfs_file_fsync(ip); @@ -149,8 +179,6 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); - xfs_ioend_wait(ip); - if (mp->m_flags & XFS_MOUNT_BARRIER) { /* * If we have an RT and/or log subvolume we need to make sure @@ -214,11 +242,11 @@ xfs_file_fsync( * transaction. So we play it safe and fire off the * transaction anyway. */ - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = _xfs_trans_commit(tp, 0, &log_flushed); + error = xfs_trans_commit(tp, 0); + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_EXCL); } else { /* @@ -229,14 +257,14 @@ xfs_file_fsync( * disk yet, the inode will be still be pinned. If it is, * force the log. */ - if (xfs_ipincount(ip)) { - error = _xfs_log_force_lsn(mp, - ip->i_itemp->ili_last_lsn, - XFS_LOG_SYNC, &log_flushed); - } + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_SHARED); } + if (!error && lsn) + error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); + /* * If we only have a single device, and the log force about was * a no-op we might have to flush the data device cache here. @@ -315,7 +343,19 @@ xfs_file_aio_read( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (unlikely(ioflags & IO_ISDIRECT)) { + /* + * Locking is a bit tricky here. If we take an exclusive lock + * for direct IO, we effectively serialise all new concurrent + * read IO to this file and block it behind IO that is currently in + * progress because IO in progress holds the IO lock shared. We only + * need to hold the lock exclusive to blow away the page cache, so + * only take lock exclusively if the page cache needs invalidation. + * This allows the normal direct IO case of no page cache pages to + * proceeed concurrently without serialisation. + */ + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { @@ -328,8 +368,7 @@ xfs_file_aio_read( } } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - } else - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + } trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); @@ -405,11 +444,13 @@ xfs_aio_write_isize_update( */ STATIC void xfs_aio_write_newsize_update( - struct xfs_inode *ip) + struct xfs_inode *ip, + xfs_fsize_t new_size) { - if (ip->i_new_size) { + if (new_size == ip->i_new_size) { xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; + if (new_size == ip->i_new_size) + ip->i_new_size = 0; if (ip->i_d.di_size > ip->i_size) ip->i_d.di_size = ip->i_size; xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); @@ -460,7 +501,7 @@ xfs_file_splice_write( ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); xfs_aio_write_isize_update(inode, ppos, ret); - xfs_aio_write_newsize_update(ip); + xfs_aio_write_newsize_update(ip, new_size); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -498,11 +539,9 @@ xfs_zero_last_block( last_fsb = XFS_B_TO_FSBT(mp, isize); nimaps = 1; - error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, - &nimaps, NULL); - if (error) { + error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); + if (error) return error; - } ASSERT(nimaps > 0); /* * If the block underlying isize is just a hole, then there @@ -593,8 +632,8 @@ xfs_zero_eof( while (start_zero_fsb <= end_zero_fsb) { nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, - 0, NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, + &imap, &nimaps, 0); if (error) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); return error; @@ -657,6 +696,7 @@ xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, + xfs_fsize_t *new_sizep, int *iolock) { struct inode *inode = file->f_mapping->host; @@ -664,6 +704,9 @@ xfs_file_aio_write_checks( xfs_fsize_t new_size; int error = 0; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + *new_sizep = 0; +restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); @@ -671,20 +714,41 @@ xfs_file_aio_write_checks( return error; } - new_size = *pos + *count; - if (new_size > ip->i_size) - ip->i_new_size = new_size; - if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. + * write. There is no need to issue zeroing if another in-flght IO ends + * at or before this one If zeronig is needed and we are currently + * holding the iolock shared, we need to update it to exclusive which + * involves dropping all locks and relocking to maintain correct locking + * order. If we do this, restart the function to ensure all checks and + * values are still valid. */ - if (*pos > ip->i_size) + if ((ip->i_new_size && *pos > ip->i_new_size) || + (!ip->i_new_size && *pos > ip->i_size)) { + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + goto restart; + } error = -xfs_zero_eof(ip, *pos, ip->i_size); + } + + /* + * If this IO extends beyond EOF, we may need to update ip->i_new_size. + * We have already zeroed space beyond EOF (if necessary). Only update + * ip->i_new_size if this IO ends beyond any other in-flight writes. + */ + new_size = *pos + *count; + if (new_size > ip->i_size) { + if (new_size > ip->i_new_size) + ip->i_new_size = new_size; + *new_sizep = new_size; + } xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) @@ -719,7 +783,7 @@ xfs_file_aio_write_checks( * the dio layer. To avoid the problem with aio, we also need to wait for * outstanding IOs to complete so that unwritten extent conversion is completed * before we try to map the overlapping block. This is currently implemented by - * hitting it with a big hammer (i.e. xfs_ioend_wait()). + * hitting it with a big hammer (i.e. inode_dio_wait()). * * Returns with locks held indicated by @iolock and errors indicated by * negative return values. @@ -731,6 +795,7 @@ xfs_file_dio_aio_write( unsigned long nr_segs, loff_t pos, size_t ocount, + xfs_fsize_t *new_size, int *iolock) { struct file *file = iocb->ki_filp; @@ -751,18 +816,35 @@ xfs_file_dio_aio_write( if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) unaligned_io = 1; - if (unaligned_io || mapping->nrpages || pos > ip->i_size) + /* + * We don't need to take an exclusive lock unless there page cache needs + * to be invalidated or unaligned IO is being executed. We don't need to + * consider the EOF extension case here because + * xfs_file_aio_write_checks() will relock the inode as necessary for + * EOF zeroing cases and fill out the new inode size as appropriate. + */ + if (unaligned_io || mapping->nrpages) *iolock = XFS_IOLOCK_EXCL; else *iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); + + /* + * Recheck if there are cached pages that need invalidate after we got + * the iolock to protect against other threads adding new pages while + * we were waiting for the iolock. + */ + if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + } - ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; if (mapping->nrpages) { - WARN_ON(*iolock != XFS_IOLOCK_EXCL); ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (ret) @@ -774,7 +856,7 @@ xfs_file_dio_aio_write( * otherwise demote the lock if we had to flush cached pages */ if (unaligned_io) - xfs_ioend_wait(ip); + inode_dio_wait(inode); else if (*iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); *iolock = XFS_IOLOCK_SHARED; @@ -796,6 +878,7 @@ xfs_file_buffered_aio_write( unsigned long nr_segs, loff_t pos, size_t ocount, + xfs_fsize_t *new_size, int *iolock) { struct file *file = iocb->ki_filp; @@ -807,9 +890,9 @@ xfs_file_buffered_aio_write( size_t count = ocount; *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; @@ -849,6 +932,7 @@ xfs_file_aio_write( ssize_t ret; int iolock; size_t ocount = 0; + xfs_fsize_t new_size = 0; XFS_STATS_INC(xs_write_calls); @@ -868,10 +952,10 @@ xfs_file_aio_write( if (unlikely(file->f_flags & O_DIRECT)) ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, - ocount, &iolock); + ocount, &new_size, &iolock); else ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount, &iolock); + ocount, &new_size, &iolock); xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); @@ -881,15 +965,18 @@ xfs_file_aio_write( /* Handle various SYNC-type writes */ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { loff_t end = pos + ret - 1; + int error; xfs_rw_iunlock(ip, iolock); - ret = -xfs_file_fsync(file, pos, end, + error = xfs_file_fsync(file, pos, end, (file->f_flags & __O_SYNC) ? 0 : 1); xfs_rw_ilock(ip, iolock); + if (error) + ret = error; } out_unlock: - xfs_aio_write_newsize_update(ip); + xfs_aio_write_newsize_update(ip, new_size); xfs_rw_iunlock(ip, iolock); return ret; } @@ -1082,7 +1169,7 @@ const struct file_operations xfs_dir_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, #endif - .fsync = xfs_file_fsync, + .fsync = xfs_dir_fsync, }; static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 9124425b7f2f..5170306a1009 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -344,9 +344,9 @@ _xfs_filestream_update_ag( * Either ip is a regular file and pip is a directory, or ip is a * directory and pip is NULL. */ - ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip && - (pip->i_d.di_mode & S_IFDIR)) || - ((ip->i_d.di_mode & S_IFDIR) && !pip))); + ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip && + S_ISDIR(pip->i_d.di_mode)) || + (S_ISDIR(ip->i_d.di_mode) && !pip))); mp = ip->i_mount; cache = mp->m_filestream; @@ -537,7 +537,7 @@ xfs_filestream_lookup_ag( xfs_agnumber_t ag; int ref; - if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) { + if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) { ASSERT(0); return NULLAGNUMBER; } @@ -579,9 +579,9 @@ xfs_filestream_associate( xfs_agnumber_t ag, rotorstep, startag; int err = 0; - ASSERT(pip->i_d.di_mode & S_IFDIR); - ASSERT(ip->i_d.di_mode & S_IFREG); - if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG)) + ASSERT(S_ISDIR(pip->i_d.di_mode)); + ASSERT(S_ISREG(ip->i_d.di_mode)); + if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode)) return -EINVAL; mp = pip->i_mount; @@ -682,7 +682,7 @@ xfs_filestream_new_ag( ip = ap->ip; mp = ip->i_mount; cache = mp->m_filestream; - minlen = ap->alen; + minlen = ap->length; *agp = NULLAGNUMBER; /* @@ -761,7 +761,7 @@ xfs_filestream_new_ag( */ ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | - (ap->low ? XFS_PICK_LOWSPACE : 0); + (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); if (err || *agp == NULLAGNUMBER) diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index ed88ed16811c..ed88ed16811c 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 9153d2c77caf..1c6fdeb702ff 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -194,6 +194,10 @@ xfs_growfs_data_private( bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } agf = XFS_BUF_TO_AGF(bp); memset(agf, 0, mp->m_sb.sb_sectsize); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -216,16 +220,21 @@ xfs_growfs_data_private( tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * AG inode header block */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } agi = XFS_BUF_TO_AGI(bp); memset(agi, 0, mp->m_sb.sb_sectsize); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -240,10 +249,11 @@ xfs_growfs_data_private( agi->agi_dirino = cpu_to_be32(NULLAGINO); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * BNO btree root block */ @@ -251,6 +261,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); @@ -262,10 +276,11 @@ xfs_growfs_data_private( arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * CNT btree root block */ @@ -273,6 +288,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); @@ -285,10 +304,11 @@ xfs_growfs_data_private( arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); nfree += be32_to_cpu(arec->ar_blockcount); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * INO btree root block */ @@ -296,6 +316,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); @@ -303,10 +327,10 @@ xfs_growfs_data_private( block->bb_numrecs = 0; block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } } xfs_trans_agblocks_delta(tp, nfree); /* @@ -396,9 +420,9 @@ xfs_growfs_data_private( * just issue a warning and continue. The real work is * already done and committed. */ - if (!(error = xfs_bwrite(mp, bp))) { - continue; - } else { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) { xfs_warn(mp, "write error %d updating secondary superblock for ag %d", error, agno); diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/xfs_globals.c index 76e81cff70b9..76e81cff70b9 100644 --- a/fs/xfs/linux-2.6/xfs_globals.c +++ b/fs/xfs/xfs_globals.c diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index dd5628bd8d0b..169380e66057 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -150,7 +150,7 @@ xfs_check_agi_freecount( /* * Initialise a new set of inodes. */ -STATIC void +STATIC int xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, @@ -202,9 +202,8 @@ xfs_ialloc_inode_init( fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * blks_per_cluster, XBF_LOCK); - ASSERT(fbuf); - ASSERT(!XFS_BUF_GETERROR(fbuf)); - + if (!fbuf) + return ENOMEM; /* * Initialize all inodes in this buffer and then log them. * @@ -226,6 +225,7 @@ xfs_ialloc_inode_init( } xfs_trans_inode_alloc_buf(tp, fbuf); } + return 0; } /* @@ -370,9 +370,11 @@ xfs_ialloc_ag_alloc( * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len, - random32()); + error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, + args.len, random32()); + if (error) + return error; /* * Convert the results. */ @@ -1486,7 +1488,7 @@ xfs_read_agi( if (error) return error; - ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp)); + ASSERT(!xfs_buf_geterror(*bpp)); agi = XFS_BUF_TO_AGI(*bpp); /* @@ -1503,7 +1505,7 @@ xfs_read_agi( return XFS_ERROR(EFSCORRUPTED); } - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF); + xfs_buf_set_ref(*bpp, XFS_AGI_REF); xfs_check_agi_unlinked(agi); return 0; diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 7759812c1bbe..0fa98b1c70ea 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -75,7 +75,6 @@ xfs_inode_alloc( return NULL; } - ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); @@ -150,7 +149,6 @@ xfs_inode_free( } /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3cc21ddf9f7e..c0237c602f11 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -190,12 +190,6 @@ xfs_imap_to_bp( } xfs_inobp_check(mp, bp); - - /* - * Mark the buffer as an inode buffer now that it looks good - */ - XFS_BUF_SET_VTYPE(bp, B_FS_INO); - *bpp = bp; return 0; } @@ -368,7 +362,7 @@ xfs_iformat( /* * no local regular files yet */ - if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { + if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { xfs_warn(ip->i_mount, "corrupt inode %Lu (local format for regular file).", (unsigned long long) ip->i_ino); @@ -1040,7 +1034,7 @@ xfs_ialloc( if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; - if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { + if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { ip->i_d.di_mode |= S_ISGID; } } @@ -1097,14 +1091,14 @@ xfs_ialloc( if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { uint di_flags = 0; - if ((mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSZINHERIT; ip->i_d.di_extsize = pip->i_d.di_extsize; } - } else if ((mode & S_IFMT) == S_IFREG) { + } else if (S_ISREG(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_REALTIME; if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { @@ -1152,7 +1146,7 @@ xfs_ialloc( /* * Log the new values stuffed into the inode. */ - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, flags); /* now that we have an i_mode we can setup inode ops and unlock */ @@ -1187,8 +1181,9 @@ xfs_isize_check( xfs_fileoff_t map_first; int nimaps; xfs_bmbt_irec_t imaps[2]; + int error; - if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) + if (!S_ISREG(ip->i_d.di_mode)) return; if (XFS_IS_REALTIME_INODE(ip)) @@ -1203,13 +1198,12 @@ xfs_isize_check( * The filesystem could be shutting down, so bmapi may return * an error. */ - if (xfs_bmapi(NULL, ip, map_first, + error = xfs_bmapi_read(ip, map_first, (XFS_B_TO_FSB(mp, - (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - - map_first), - XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, - NULL)) - return; + (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), + imaps, &nimaps, XFS_BMAPI_ENTIRE); + if (error) + return; ASSERT(nimaps == 1); ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); } @@ -1297,7 +1291,7 @@ xfs_itruncate_extents( */ error = xfs_bmap_finish(&tp, &free_list, &committed); if (committed) - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); if (error) goto out_bmap_cancel; @@ -1313,7 +1307,7 @@ xfs_itruncate_extents( error = xfs_trans_commit(tp, 0); tp = ntp; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); if (error) goto out; @@ -1644,7 +1638,7 @@ xfs_iunlink_remove( * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. */ -STATIC void +STATIC int xfs_ifree_cluster( xfs_inode_t *free_ip, xfs_trans_t *tp, @@ -1690,6 +1684,8 @@ xfs_ifree_cluster( mp->m_bsize * blks_per_cluster, XBF_LOCK); + if (!bp) + return ENOMEM; /* * Walk the inodes already attached to the buffer and mark them * stale. These will all have the flush locks held, so an @@ -1799,6 +1795,7 @@ retry: } xfs_perag_put(pag); + return 0; } /* @@ -1828,7 +1825,7 @@ xfs_ifree( ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || - ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); + (!S_ISREG(ip->i_d.di_mode))); ASSERT(ip->i_d.di_nblocks == 0); /* @@ -1878,10 +1875,10 @@ xfs_ifree( dip->di_mode = 0; if (delete) { - xfs_ifree_cluster(ip, tp, first_ino); + error = xfs_ifree_cluster(ip, tp, first_ino); } - return 0; + return error; } /* @@ -2472,11 +2469,11 @@ cluster_corrupt_out: */ if (bp->b_iodone) { XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); - XFS_BUF_ERROR(bp,EIO); + xfs_buf_stale(bp); + xfs_buf_ioerror(bp, EIO); xfs_buf_ioend(bp, 0); } else { - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_relse(bp); } } @@ -2585,7 +2582,7 @@ xfs_iflush( * If the buffer is pinned then push on the log now so we won't * get stuck waiting in the write for too long. */ - if (XFS_BUF_ISPINNED(bp)) + if (xfs_buf_ispinned(bp)) xfs_log_force(mp, 0); /* @@ -2597,9 +2594,11 @@ xfs_iflush( goto cluster_corrupt_out; if (flags & SYNC_WAIT) - error = xfs_bwrite(mp, bp); + error = xfs_bwrite(bp); else - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + + xfs_buf_relse(bp); return error; corrupt_out: @@ -2671,7 +2670,7 @@ xfs_iflush_int( __func__, ip->i_ino, ip, ip->i_d.di_magic); goto corrupt_out; } - if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + if (S_ISREG(ip->i_d.di_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), @@ -2681,7 +2680,7 @@ xfs_iflush_int( __func__, ip->i_ino, ip); goto corrupt_out; } - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + } else if (S_ISDIR(ip->i_d.di_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index a97644ab945a..760140d1dd66 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -257,13 +257,12 @@ typedef struct xfs_inode { xfs_fsize_t i_size; /* in-memory size */ xfs_fsize_t i_new_size; /* size when write completes */ - atomic_t i_iocount; /* outstanding I/O count */ /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; -#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ +#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \ (ip)->i_size : (ip)->i_d.di_size; /* Convert from vfs inode to xfs inode */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 588406dc6a35..abaafdbb3e65 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -658,10 +658,8 @@ xfs_inode_item_unlock( lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; - if (lock_flags) { + if (lock_flags) xfs_iunlock(ip, lock_flags); - IRELE(ip); - } } /* @@ -708,13 +706,14 @@ xfs_inode_item_committed( * marked delayed write. If that's the case, we'll promote it and that will * allow the caller to write the buffer by triggering the xfsbufd to run. */ -STATIC void +STATIC bool xfs_inode_item_pushbuf( struct xfs_log_item *lip) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; struct xfs_buf *bp; + bool ret = true; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); @@ -725,7 +724,7 @@ xfs_inode_item_pushbuf( if (completion_done(&ip->i_flush) || !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); - return; + return true; } bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, @@ -733,10 +732,13 @@ xfs_inode_item_pushbuf( xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!bp) - return; + return true; if (XFS_BUF_ISDELAYWRITE(bp)) xfs_buf_delwri_promote(bp); + if (xfs_buf_ispinned(bp)) + ret = false; xfs_buf_relse(bp); + return ret; } /* @@ -793,7 +795,7 @@ xfs_inode_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_inode_item_ops = { +static const struct xfs_item_ops xfs_inode_item_ops = { .iop_size = xfs_inode_item_size, .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index acca2c5ca3fa..d99a90518909 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -265,7 +265,7 @@ xfs_open_by_handle( return PTR_ERR(filp); } - if (inode->i_mode & S_IFREG) { + if (S_ISREG(inode->i_mode)) { filp->f_flags |= O_NOATIME; filp->f_mode |= FMODE_NOCMTIME; } @@ -850,14 +850,14 @@ xfs_set_diflags( di_flags |= XFS_DIFLAG_NODEFRAG; if (xflags & XFS_XFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; - if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(ip->i_d.di_mode)) { if (xflags & XFS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (xflags & XFS_XFLAG_NOSYMLINKS) di_flags |= XFS_DIFLAG_NOSYMLINKS; if (xflags & XFS_XFLAG_EXTSZINHERIT) di_flags |= XFS_DIFLAG_EXTSZINHERIT; - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + } else if (S_ISREG(ip->i_d.di_mode)) { if (xflags & XFS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; if (xflags & XFS_XFLAG_EXTSIZE) @@ -1069,7 +1069,7 @@ xfs_ioctl_setattr( } } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index d56173b34a2a..d56173b34a2a 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 54e623bfbb85..54e623bfbb85 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 80f4060e8970..80f4060e8970 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 091d82b94c4d..9afa282aa937 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -208,22 +208,20 @@ xfs_iomap_write_direct( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); - bmapi_flag = XFS_BMAPI_WRITE; + bmapi_flag = 0; if (offset < ip->i_size || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* - * Issue the xfs_bmapi() call to allocate the blocks. - * * From this point onwards we overwrite the imap pointer that the * caller gave to us. */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, - &firstfsb, 0, imap, &nimaps, &free_list); + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, + &firstfsb, 0, imap, &nimaps, &free_list); if (error) goto error0; @@ -300,8 +298,8 @@ xfs_iomap_eof_want_preallocate( while (count_fsb > 0) { imaps = nimaps; firstblock = NULLFSBLOCK; - error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0, - &firstblock, 0, imap, &imaps, NULL); + error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, + 0); if (error) return error; for (n = 0; n < imaps; n++) { @@ -381,7 +379,6 @@ xfs_iomap_write_delay( xfs_fileoff_t last_fsb; xfs_off_t aligned_offset; xfs_fileoff_t ioalign; - xfs_fsblock_t firstblock; xfs_extlen_t extsz; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; @@ -425,12 +422,8 @@ retry: } nimaps = XFS_WRITE_IMAPS; - firstblock = NULLFSBLOCK; - error = xfs_bmapi(NULL, ip, offset_fsb, - (xfs_filblks_t)(last_fsb - offset_fsb), - XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | - XFS_BMAPI_ENTIRE, &firstblock, 1, imap, - &nimaps, NULL); + error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, + imap, &nimaps, XFS_BMAPI_ENTIRE); switch (error) { case 0: case ENOSPC: @@ -535,7 +528,7 @@ xfs_iomap_write_allocate( return XFS_ERROR(error); } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_bmap_init(&free_list, &first_block); @@ -587,14 +580,12 @@ xfs_iomap_write_allocate( } /* - * Go get the actual blocks. - * * From this point onwards we overwrite the imap * pointer that the caller gave to us. */ - error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, - XFS_BMAPI_WRITE, &first_block, 1, - imap, &nimaps, &free_list); + error = xfs_bmapi_write(tp, ip, map_start_fsb, + count_fsb, 0, &first_block, 1, + imap, &nimaps, &free_list); if (error) goto trans_cancel; @@ -701,15 +692,15 @@ xfs_iomap_write_unwritten( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Modify the unwritten extent state of the buffer. */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, - XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_CONVERT, &firstfsb, 1, &imap, &nimaps, &free_list); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/xfs_iops.c index 501e4f630548..23ce927973a4 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -70,9 +70,8 @@ xfs_synchronize_times( } /* - * If the linux inode is valid, mark it dirty. - * Used when committing a dirty inode into a transaction so that - * the inode will get written back by the linux code + * If the linux inode is valid, mark it dirty, else mark the dirty state + * in the XFS inode to make sure we pick it up when reclaiming the inode. */ void xfs_mark_inode_dirty_sync( @@ -82,6 +81,10 @@ xfs_mark_inode_dirty_sync( if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) mark_inode_dirty_sync(inode); + else { + barrier(); + ip->i_update_core = 1; + } } void @@ -92,6 +95,28 @@ xfs_mark_inode_dirty( if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) mark_inode_dirty(inode); + else { + barrier(); + ip->i_update_core = 1; + } + +} + + +int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + struct xfs_inode *ip = XFS_I(inode); + int error = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + error = xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); + if (error < 0) + break; + } + return error; } /* @@ -100,31 +125,15 @@ xfs_mark_inode_dirty( * these attrs can be journalled at inode creation time (along with the * inode, of course, such that log replay can't cause these to be lost). */ + STATIC int xfs_init_security( struct inode *inode, struct inode *dir, const struct qstr *qstr) { - struct xfs_inode *ip = XFS_I(inode); - size_t length; - void *value; - unsigned char *name; - int error; - - error = security_inode_init_security(inode, dir, qstr, (char **)&name, - &value, &length); - if (error) { - if (error == -EOPNOTSUPP) - return 0; - return -error; - } - - error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); - - kfree(name); - kfree(value); - return error; + return security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); } static void @@ -202,9 +211,9 @@ xfs_vn_mknod( if (default_acl) { error = -xfs_inherit_acl(inode, default_acl); + default_acl = NULL; if (unlikely(error)) goto out_cleanup_inode; - posix_acl_release(default_acl); } @@ -457,7 +466,7 @@ xfs_vn_getattr( trace_xfs_getattr(ip); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -XFS_ERROR(EIO); stat->size = XFS_ISIZE(ip); stat->dev = inode->i_sb->s_dev; @@ -603,7 +612,7 @@ xfs_setattr_nonsize( } } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. @@ -825,16 +834,16 @@ xfs_setattr_size( * care about here. */ if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, - XBF_ASYNC, FI_NONE); + error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, + FI_NONE); if (error) goto out_unlock; } /* - * Wait for all I/O to complete. + * Wait for all direct I/O to complete. */ - xfs_ioend_wait(ip); + inode_dio_wait(inode); error = -block_truncate_page(inode->i_mapping, iattr->ia_size, xfs_get_blocks); @@ -855,7 +864,7 @@ xfs_setattr_size( xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Only change the c/mtime if we are changing the size or we are @@ -1022,7 +1031,7 @@ xfs_vn_fiemap( } static const struct inode_operations xfs_inode_operations = { - .check_acl = xfs_check_acl, + .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1048,7 +1057,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .check_acl = xfs_check_acl, + .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1073,7 +1082,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .check_acl = xfs_check_acl, + .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1086,7 +1095,7 @@ static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, .put_link = xfs_vn_put_link, - .check_acl = xfs_check_acl, + .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1144,7 +1153,7 @@ xfs_setup_inode( hlist_add_fake(&inode->i_hash); inode->i_mode = ip->i_d.di_mode; - inode->i_nlink = ip->i_d.di_nlink; + set_nlink(inode, ip->i_d.di_nlink); inode->i_uid = ip->i_d.di_uid; inode->i_gid = ip->i_d.di_gid; @@ -1194,6 +1203,15 @@ xfs_setup_inode( break; } + /* + * If there is no attribute fork no ACL can exist on this inode, + * and it can't have any file capabilities attached to it either. + */ + if (!XFS_IFORK_Q(ip)) { + inode_has_no_xattr(inode); + cache_no_acl(inode); + } + xfs_iflags_clear(ip, XFS_INEW); barrier(); diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/xfs_iops.h index ef41c92ce66e..ef41c92ce66e 100644 --- a/fs/xfs/linux-2.6/xfs_iops.h +++ b/fs/xfs/xfs_iops.h diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/xfs_linux.h index d42f814e4d35..828662f70d64 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -32,13 +32,12 @@ # define XFS_BIG_INUMS 0 #endif -#include <xfs_types.h> +#include "xfs_types.h" -#include <kmem.h> -#include <mrlock.h> -#include <time.h> - -#include <support/uuid.h> +#include "kmem.h" +#include "mrlock.h" +#include "time.h" +#include "uuid.h" #include <linux/semaphore.h> #include <linux/mm.h> @@ -69,6 +68,8 @@ #include <linux/ctype.h> #include <linux/writeback.h> #include <linux/capability.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include <linux/list_sort.h> #include <asm/page.h> @@ -78,14 +79,14 @@ #include <asm/byteorder.h> #include <asm/unaligned.h> -#include <xfs_vnode.h> -#include <xfs_stats.h> -#include <xfs_sysctl.h> -#include <xfs_iops.h> -#include <xfs_aops.h> -#include <xfs_super.h> -#include <xfs_buf.h> -#include <xfs_message.h> +#include "xfs_vnode.h" +#include "xfs_stats.h" +#include "xfs_sysctl.h" +#include "xfs_iops.h" +#include "xfs_aops.h" +#include "xfs_super.h" +#include "xfs_buf.h" +#include "xfs_message.h" #ifdef __BIG_ENDIAN #define XFS_NATIVE_HOST 1 diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 06ff8437ed8e..a14cd89fe465 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -626,7 +626,7 @@ xfs_log_item_init( struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops) + const struct xfs_item_ops *ops) { item->li_mountp = mp; item->li_ailp = mp->m_ail; @@ -878,10 +878,10 @@ xlog_iodone(xfs_buf_t *bp) /* * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, + if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { - xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); - XFS_BUF_STALE(bp); + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_stale(bp); xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); /* * This flag will be propagated to the trans-committed @@ -1047,11 +1047,10 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_get_iclog_buffer_size(mp, log); error = ENOMEM; - bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); + bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0); if (!bp) goto out_free_log; bp->b_iodone = xlog_iodone; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); log->l_xbuf = bp; @@ -1108,7 +1107,6 @@ xlog_alloc_log(xfs_mount_t *mp, iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1248,8 +1246,8 @@ xlog_bdstrat( struct xlog_in_core *iclog = bp->b_fspriv; if (iclog->ic_state & XLOG_STATE_IOERROR) { - XFS_BUF_ERROR(bp, EIO); - XFS_BUF_STALE(bp); + xfs_buf_ioerror(bp, EIO); + xfs_buf_stale(bp); xfs_buf_ioend(bp, 0); /* * It would seem logical to return EIO here, but we rely on @@ -1355,7 +1353,6 @@ xlog_sync(xlog_t *log, XFS_BUF_SET_COUNT(bp, count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); bp->b_flags |= XBF_SYNCIO; @@ -1390,24 +1387,23 @@ xlog_sync(xlog_t *log, */ XFS_BUF_WRITE(bp); - if ((error = xlog_bdstrat(bp))) { - xfs_ioerror_alert("xlog_sync", log->l_mp, bp, - XFS_BUF_ADDR(bp)); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync"); return error; } if (split) { bp = iclog->ic_log->l_xbuf; XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ - XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ - (__psint_t)count), split); + xfs_buf_associate_memory(bp, + (char *)&iclog->ic_header + count, split); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; - dptr = XFS_BUF_PTR(bp); + dptr = bp->b_addr; /* * Bump the cycle numbers at the start of each block * since this part of the buffer is at the start of @@ -1427,9 +1423,9 @@ xlog_sync(xlog_t *log, /* account for internal log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); XFS_BUF_WRITE(bp); - if ((error = xlog_bdstrat(bp))) { - xfs_ioerror_alert("xlog_sync (split)", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); return error; } } diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 78c9039994af..3f7bf451c034 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -137,7 +137,7 @@ struct xfs_trans; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops); + const struct xfs_item_ops *ops); xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8fe4206de057..541a508adea1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -147,7 +147,7 @@ xlog_align( xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); - return XFS_BUF_PTR(bp) + BBTOB(offset); + return bp->b_addr + BBTOB(offset); } @@ -178,15 +178,12 @@ xlog_bread_noalign( XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); - XFS_BUF_BUSY(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) - xfs_ioerror_alert("xlog_bread", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); return error; } @@ -220,18 +217,18 @@ xlog_bread_offset( xfs_buf_t *bp, xfs_caddr_t offset) { - xfs_caddr_t orig_offset = XFS_BUF_PTR(bp); + xfs_caddr_t orig_offset = bp->b_addr; int orig_len = bp->b_buffer_length; int error, error2; - error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks)); + error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); if (error) return error; error = xlog_bread_noalign(log, blk_no, nbblks, bp); /* must reset buffer pointer even on error */ - error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len); + error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len); if (error) return error; return error2; @@ -266,15 +263,14 @@ xlog_bwrite( XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); - XFS_BUF_HOLD(bp); + xfs_buf_hold(bp); xfs_buf_lock(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); - if ((error = xfs_bwrite(log->l_mp, bp))) - xfs_ioerror_alert("xlog_bwrite", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + error = xfs_bwrite(bp); + if (error) + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); return error; } @@ -360,14 +356,12 @@ STATIC void xlog_recover_iodone( struct xfs_buf *bp) { - if (XFS_BUF_GETERROR(bp)) { + if (bp->b_error) { /* * We're not going to bother about retrying * this during recovery. One strike! */ - xfs_ioerror_alert("xlog_recover_iodone", - bp->b_target->bt_mount, bp, - XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); xfs_force_shutdown(bp->b_target->bt_mount, SHUTDOWN_META_IO_ERROR); } @@ -1262,7 +1256,7 @@ xlog_write_log_records( */ ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { - offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block); + offset = bp->b_addr + BBTOB(ealign - start_block); error = xlog_bread_offset(log, ealign, sectbb, bp, offset); if (error) @@ -2135,15 +2129,15 @@ xlog_recover_buffer_pass2( bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, buf_flags); - if (XFS_BUF_ISERROR(bp)) { - xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, - bp, buf_f->blf_blkno); - error = XFS_BUF_GETERROR(bp); + if (!bp) + return XFS_ERROR(ENOMEM); + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); xfs_buf_relse(bp); return error; } - error = 0; if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); } else if (buf_f->blf_flags & @@ -2174,15 +2168,16 @@ xlog_recover_buffer_pass2( be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { - XFS_BUF_STALE(bp); - error = xfs_bwrite(mp, bp); + xfs_buf_stale(bp); + error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); } - return (error); + xfs_buf_relse(bp); + return error; } STATIC int @@ -2227,14 +2222,16 @@ xlog_recover_inode_pass2( bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, XBF_LOCK); - if (XFS_BUF_ISERROR(bp)) { - xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, - bp, in_f->ilf_blkno); - error = XFS_BUF_GETERROR(bp); + if (!bp) { + error = ENOMEM; + goto error; + } + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); xfs_buf_relse(bp); goto error; } - error = 0; ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); @@ -2283,7 +2280,7 @@ xlog_recover_inode_pass2( /* Take the opportunity to reset the flush iteration count */ dicp->di_flushiter = 0; - if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { + if (unlikely(S_ISREG(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", @@ -2296,7 +2293,7 @@ xlog_recover_inode_pass2( error = EFSCORRUPTED; goto error; } - } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) { + } else if (unlikely(S_ISDIR(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE) && (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { @@ -2439,7 +2436,8 @@ xlog_recover_inode_pass2( write_inode_buffer: ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); error: if (need_free) kmem_free(in_f); @@ -2537,8 +2535,7 @@ xlog_recover_dquot_pass2( XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, - bp, dq_f->qlf_blkno); + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)"); return error; } ASSERT(bp); @@ -2561,7 +2558,8 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); return (0); } @@ -3437,7 +3435,7 @@ xlog_do_recovery_pass( /* * Check for header wrapping around physical end-of-log */ - offset = XFS_BUF_PTR(hbp); + offset = hbp->b_addr; split_hblks = 0; wrapped_hblks = 0; if (blk_no + hblks <= log->l_logBBsize) { @@ -3497,7 +3495,7 @@ xlog_do_recovery_pass( } else { /* This log record is split across the * physical end of log */ - offset = XFS_BUF_PTR(dbp); + offset = dbp->b_addr; split_bblks = 0; if (blk_no != log->l_logBBsize) { /* some data is before the physical @@ -3656,7 +3654,7 @@ xlog_do_recover( return error; } - XFS_bflush(log->l_mp->m_ddev_targp); + xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1); /* * If IO errors happened during recovery, bail out. @@ -3689,8 +3687,7 @@ xlog_do_recover( xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xlog_do_recover", - log->l_mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); ASSERT(0); xfs_buf_relse(bp); return error; diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/xfs_message.c index bd672def95ac..bd672def95ac 100644 --- a/fs/xfs/linux-2.6/xfs_message.c +++ b/fs/xfs/xfs_message.c diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h new file mode 100644 index 000000000000..56dc0c17f16a --- /dev/null +++ b/fs/xfs/xfs_message.h @@ -0,0 +1,37 @@ +#ifndef __XFS_MESSAGE_H +#define __XFS_MESSAGE_H 1 + +struct xfs_mount; + +extern __printf(2, 3) +void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(3, 4) +void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); +extern __printf(2, 3) +void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_err(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_info(const struct xfs_mount *mp, const char *fmt, ...); + +#ifdef DEBUG +extern __printf(2, 3) +void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...); +#else +static inline __printf(2, 3) +void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) +{ +} +#endif + +extern void assfail(char *expr, char *f, int l); + +extern void xfs_hex_dump(void *p, int length); + +#endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 7f25245da289..d06afbc3540d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -44,9 +44,6 @@ #include "xfs_trace.h" -STATIC void xfs_unmountfs_wait(xfs_mount_t *); - - #ifdef HAVE_PERCPU_SB STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int); @@ -1331,7 +1328,7 @@ xfs_mountfs( ASSERT(rip != NULL); - if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { + if (unlikely(!S_ISDIR(rip->i_d.di_mode))) { xfs_warn(mp, "corrupted root inode %llu: not a directory", (unsigned long long)rip->i_ino); xfs_iunlock(rip, XFS_ILOCK_EXCL); @@ -1484,7 +1481,7 @@ xfs_unmountfs( * state as much as possible. */ xfs_reclaim_inodes(mp, 0); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); @@ -1496,11 +1493,6 @@ xfs_unmountfs( */ xfs_log_force(mp, XFS_LOG_SYNC); - xfs_binval(mp->m_ddev_targp); - if (mp->m_rtdev_targp) { - xfs_binval(mp->m_rtdev_targp); - } - /* * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for @@ -1526,7 +1518,16 @@ xfs_unmountfs( xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); xfs_unmountfs_writesb(mp); - xfs_unmountfs_wait(mp); /* wait for async bufs */ + + /* + * Make sure all buffers have been flushed and completed before + * unmounting the log. + */ + error = xfs_flush_buftarg(mp->m_ddev_targp, 1); + if (error) + xfs_warn(mp, "%d busy buffers during unmount.", error); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_log_unmount_write(mp); xfs_log_unmount(mp); xfs_uuid_unmount(mp); @@ -1537,16 +1538,6 @@ xfs_unmountfs( xfs_free_perag(mp); } -STATIC void -xfs_unmountfs_wait(xfs_mount_t *mp) -{ - if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_wait_buftarg(mp->m_logdev_targp); - if (mp->m_rtdev_targp) - xfs_wait_buftarg(mp->m_rtdev_targp); - xfs_wait_buftarg(mp->m_ddev_targp); -} - int xfs_fs_writable(xfs_mount_t *mp) { @@ -1612,15 +1603,14 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) XFS_BUF_UNDONE(sbp); XFS_BUF_UNREAD(sbp); - XFS_BUF_UNDELAYWRITE(sbp); + xfs_buf_delwri_dequeue(sbp); XFS_BUF_WRITE(sbp); XFS_BUF_UNASYNC(sbp); - ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); + ASSERT(sbp->b_target == mp->m_ddev_targp); xfsbdstrat(mp, sbp); error = xfs_buf_iowait(sbp); if (error) - xfs_ioerror_alert("xfs_unmountfs_writesb", - mp, sbp, XFS_BUF_ADDR(sbp)); + xfs_buf_ioerror_alert(sbp, __func__); xfs_buf_relse(sbp); } return error; @@ -1938,7 +1928,7 @@ xfs_getsb( xfs_buf_lock(bp); } - XFS_BUF_HOLD(bp); + xfs_buf_hold(bp); ASSERT(XFS_BUF_ISDONE(bp)); return bp; } diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/xfs_qm.c index 46e54ad9a2dc..0bbb1a41998b 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -674,7 +674,8 @@ xfs_qm_dqattach_one( * disk and we didn't ask it to allocate; * ESRCH if quotas got turned off suddenly. */ - error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp); + error = xfs_qm_dqget(ip->i_mount, ip, id, type, + doalloc | XFS_QMOPT_DOWARN, &dqp); if (error) return error; @@ -1240,7 +1241,7 @@ xfs_qm_reset_dqcounts( do_div(j, sizeof(xfs_dqblk_t)); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif - ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); + ddq = bp->b_addr; for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { /* * Do a sanity check, and if needed, repair the dqblk. Don't @@ -1296,7 +1297,8 @@ xfs_qm_dqiter_bufs( break; xfs_qm_reset_dqcounts(mp, bp, firstid, type); - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); /* * goto the next block. */ @@ -1346,11 +1348,8 @@ xfs_qm_dqiterate( * the inode is never added to the transaction. */ xfs_ilock(qip, XFS_ILOCK_SHARED); - error = xfs_bmapi(NULL, qip, lblkno, - maxlblkcnt - lblkno, - XFS_BMAPI_METADATA, - NULL, - 0, map, &nmaps, NULL); + error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, + map, &nmaps, 0); xfs_iunlock(qip, XFS_ILOCK_SHARED); if (error) break; @@ -1683,7 +1682,7 @@ xfs_qm_quotacheck( * quotacheck'd stamp on the superblock. So, here we do a synchronous * flush. */ - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); /* * If one type of quotas is off, then it will lose its diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/xfs_qm.h index 43b9abe1052c..43b9abe1052c 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/xfs_qm.h diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index a0a829addca9..a0a829addca9 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c index 8671a0b32644..8671a0b32644 100644 --- a/fs/xfs/quota/xfs_qm_stats.c +++ b/fs/xfs/xfs_qm_stats.c diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h index 5b964fc0dc09..5b964fc0dc09 100644 --- a/fs/xfs/quota/xfs_qm_stats.h +++ b/fs/xfs/xfs_qm_stats.h diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 609246f42e6c..5cc3dde1bc90 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -261,7 +261,7 @@ xfs_qm_scall_trunc_qfile( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, 0); if (error) { diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h index 94a3d927d716..94a3d927d716 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/xfs_quota_priv.h diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 29b9d642e93d..7e76f537abb7 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -25,7 +25,7 @@ #include "xfs_trans.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "quota/xfs_qm.h" +#include "xfs_qm.h" #include <linux/quota.h> diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 77a59891734e..866de277079a 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -116,7 +116,7 @@ xfs_rename( trace_xfs_rename(src_dp, target_dp, src_name, target_name); new_parent = (src_dp != target_dp); - src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); + src_is_directory = S_ISDIR(src_ip->i_d.di_mode); if (src_is_directory) { /* @@ -170,12 +170,12 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. */ - xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) - xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow renames @@ -226,7 +226,7 @@ xfs_rename( * target and source are directories and that target can be * destroyed, or that neither is a directory. */ - if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(target_ip->i_d.di_mode)) { /* * Make sure target dir is empty. */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 8f76fdff4f46..87323f1ded64 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -112,7 +112,7 @@ xfs_growfs_rt_alloc( * Lock the inode. */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_bmap_init(&flist, &firstblock); /* @@ -120,9 +120,9 @@ xfs_growfs_rt_alloc( */ nmap = 1; cancelflags |= XFS_TRANS_ABORT; - error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, - XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, - resblks, &map, &nmap, &flist); + error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_METADATA, &firstblock, + resblks, &map, &nmap, &flist); if (!error && nmap < 1) error = XFS_ERROR(ENOSPC); if (error) @@ -155,7 +155,7 @@ xfs_growfs_rt_alloc( * Lock the bitmap inode. */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * Get a buffer for the block. */ @@ -168,7 +168,7 @@ error_cancel: xfs_trans_cancel(tp, cancelflags); goto error; } - memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); + memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* * Commit the transaction. @@ -856,34 +856,24 @@ xfs_rtbuf_get( xfs_buf_t **bpp) /* output: buffer for the block */ { xfs_buf_t *bp; /* block buffer, result */ - xfs_daddr_t d; /* disk addr of block */ - int error; /* error value */ - xfs_fsblock_t fsb; /* fs block number for block */ xfs_inode_t *ip; /* bitmap or summary inode */ + xfs_bmbt_irec_t map; + int nmap; + int error; /* error value */ ip = issum ? mp->m_rsumip : mp->m_rbmip; - /* - * Map from the file offset (block) and inode number to the - * file system block. - */ - error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block); - if (error) { + + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fsb != NULLFSBLOCK); - /* - * Convert to disk address for buffer cache. - */ - d = XFS_FSB_TO_DADDR(mp, fsb); - /* - * Read the buffer. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + + ASSERT(map.br_startblock != NULLFSBLOCK); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), mp->m_bsize, 0, &bp); - if (error) { + if (error) return error; - } - ASSERT(bp && !XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); *bpp = bp; return 0; } @@ -943,7 +933,7 @@ xfs_rtcheck_range( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; /* * Compute the starting word's address, and starting bit. */ @@ -994,7 +984,7 @@ xfs_rtcheck_range( if (error) { return error; } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + b = bufp = bp->b_addr; word = 0; } else { /* @@ -1040,7 +1030,7 @@ xfs_rtcheck_range( if (error) { return error; } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + b = bufp = bp->b_addr; word = 0; } else { /* @@ -1158,7 +1148,7 @@ xfs_rtfind_back( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; /* * Get the first word's index & point to it. */ @@ -1210,7 +1200,7 @@ xfs_rtfind_back( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; word = XFS_BLOCKWMASK(mp); b = &bufp[word]; } else { @@ -1256,7 +1246,7 @@ xfs_rtfind_back( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; word = XFS_BLOCKWMASK(mp); b = &bufp[word]; } else { @@ -1333,7 +1323,7 @@ xfs_rtfind_forw( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; /* * Get the first word's index & point to it. */ @@ -1384,7 +1374,7 @@ xfs_rtfind_forw( if (error) { return error; } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + b = bufp = bp->b_addr; word = 0; } else { /* @@ -1429,7 +1419,7 @@ xfs_rtfind_forw( if (error) { return error; } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + b = bufp = bp->b_addr; word = 0; } else { /* @@ -1649,7 +1639,7 @@ xfs_rtmodify_range( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; /* * Compute the starting word's address, and starting bit. */ @@ -1694,7 +1684,7 @@ xfs_rtmodify_range( if (error) { return error; } - first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + first = b = bufp = bp->b_addr; word = 0; } else { /* @@ -1734,7 +1724,7 @@ xfs_rtmodify_range( if (error) { return error; } - first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + first = b = bufp = bp->b_addr; word = 0; } else { /* @@ -1832,8 +1822,8 @@ xfs_rtmodify_summary( */ sp = XFS_SUMPTR(mp, bp, so); *sp += delta; - xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)), - (uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1)); + xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr), + (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1)); return 0; } @@ -1970,7 +1960,7 @@ xfs_growfs_rt( * Lock out other callers by grabbing the bitmap inode lock. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* * Update the bitmap inode's size. */ @@ -1982,7 +1972,7 @@ xfs_growfs_rt( * Get the summary inode into the transaction. */ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); /* * Update the summary inode's size. */ @@ -2153,7 +2143,7 @@ xfs_rtfree_extent( * Synchronize by locking the bitmap inode. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); #if defined(__KERNEL__) && defined(DEBUG) /* diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 09e1f4f35e97..f7f3a359c1c5 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -47,7 +47,7 @@ struct xfs_trans; #define XFS_SUMOFFSTOBLOCK(mp,s) \ (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) #define XFS_SUMPTR(mp,bp,so) \ - ((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \ + ((xfs_suminfo_t *)((bp)->b_addr + \ (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) #define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index d6d6fdfe9422..597d044a09a1 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -92,24 +92,6 @@ xfs_do_force_shutdown( } /* - * Prints out an ALERT message about I/O error. - */ -void -xfs_ioerror_alert( - char *func, - struct xfs_mount *mp, - xfs_buf_t *bp, - xfs_daddr_t blkno) -{ - xfs_alert(mp, - "I/O error occurred: meta-data dev %s block 0x%llx" - " (\"%s\") error %d buf count %zd", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), - (__uint64_t)blkno, func, - XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); -} - -/* * This isn't an absolute requirement, but it is * just a good idea to call xfs_read_buf instead of * directly doing a read_buf call. For one, we shouldn't @@ -137,20 +119,19 @@ xfs_read_buf( bp = xfs_buf_read(target, blkno, len, flags); if (!bp) return XFS_ERROR(EIO); - error = XFS_BUF_GETERROR(bp); - if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) { + error = bp->b_error; + if (!error && !XFS_FORCED_SHUTDOWN(mp)) { *bpp = bp; } else { *bpp = NULL; if (error) { - xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); } else { error = XFS_ERROR(EIO); } if (bp) { XFS_BUF_UNDONE(bp); - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); /* * brelse clears B_ERROR and b_error */ diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index 11c41ec6ed75..bbdb9ad6a4ba 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -42,8 +42,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, xfs_daddr_t blkno, int len, uint flags, struct xfs_buf **bpp); -extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, - xfs_buf_t *bp, xfs_daddr_t blkno); extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); #endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 1eb2ba586814..cb6ae715814a 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -509,7 +509,7 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) -#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/xfs_stats.c index 76fdc5861932..76fdc5861932 100644 --- a/fs/xfs/linux-2.6/xfs_stats.c +++ b/fs/xfs/xfs_stats.c diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/xfs_stats.h index 736854b1ca1a..736854b1ca1a 100644 --- a/fs/xfs/linux-2.6/xfs_stats.h +++ b/fs/xfs/xfs_stats.h diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/xfs_super.c index 9a72dda58bd0..3eca58f51ae9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -356,6 +356,8 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { mp->m_flags &= ~XFS_MOUNT_DELAYLOG; + xfs_warn(mp, + "nodelaylog is deprecated and will be removed in Linux 3.3"); } else if (!strcmp(this_char, MNTOPT_DISCARD)) { mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { @@ -794,8 +796,6 @@ xfs_fs_destroy_inode( if (is_bad_inode(inode)) goto out_reclaim; - xfs_ioend_wait(ip); - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); /* @@ -835,7 +835,6 @@ xfs_fs_inode_init_once( inode_init_once(VFS_I(ip)); /* xfs inode */ - atomic_set(&ip->i_iocount, 0); atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); init_waitqueue_head(&ip->i_ipin_wait); @@ -877,33 +876,17 @@ xfs_log_inode( struct xfs_trans *tp; int error; - xfs_iunlock(ip, XFS_ILOCK_SHARED); tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { xfs_trans_cancel(tp, 0); - /* we need to return with the lock hold shared */ - xfs_ilock(ip, XFS_ILOCK_SHARED); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* - * Note - it's possible that we might have pushed ourselves out of the - * way during trans_reserve which would flush the inode. But there's - * no guarantee that the inode buffer has actually gone out yet (it's - * delwri). Plus the buffer could be pinned anyway if it's part of - * an inode in another recent transaction. So we play it safe and - * fire off the transaction anyway. - */ - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); - xfs_ilock_demote(ip, XFS_ILOCK_EXCL); - - return error; + return xfs_trans_commit(tp, 0); } STATIC int @@ -918,7 +901,9 @@ xfs_fs_write_inode( trace_xfs_write_inode(ip); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -XFS_ERROR(EIO); + if (!ip->i_update_core) + return 0; if (wbc->sync_mode == WB_SYNC_ALL) { /* @@ -926,15 +911,12 @@ xfs_fs_write_inode( * of forcing it all the way to stable storage using a * synchronous transaction we let the log force inside the * ->sync_fs call do that for thus, which reduces the number - * of synchronous log foces dramatically. + * of synchronous log forces dramatically. */ - xfs_ioend_wait(ip); - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (ip->i_update_core) { - error = xfs_log_inode(ip); - if (error) - goto out_unlock; - } + error = xfs_log_inode(ip); + if (error) + goto out; + return 0; } else { /* * We make this non-blocking if the inode is contended, return @@ -1033,7 +1015,7 @@ xfs_fs_put_super( */ xfs_filestream_unmount(mp); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_unmountfs(mp); xfs_freesb(mp); @@ -1457,7 +1439,7 @@ xfs_fs_fill_super( */ xfs_filestream_unmount(mp); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_unmountfs(mp); goto out_free_sb; @@ -1666,24 +1648,13 @@ xfs_init_workqueues(void) */ xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); if (!xfs_syncd_wq) - goto out; - - xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8); - if (!xfs_ail_wq) - goto out_destroy_syncd; - + return -ENOMEM; return 0; - -out_destroy_syncd: - destroy_workqueue(xfs_syncd_wq); -out: - return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { - destroy_workqueue(xfs_ail_wq); destroy_workqueue(xfs_syncd_wq); } @@ -1695,7 +1666,6 @@ init_xfs_fs(void) printk(KERN_INFO XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"); - xfs_ioend_init(); xfs_dir_startup(); error = xfs_init_zones(); diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/xfs_super.h index 50a3266c999e..50a3266c999e 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/xfs_super.h diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/xfs_sync.c index e4c938afb910..aa3dc1a4d53d 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -227,21 +227,17 @@ xfs_sync_inode_data( int error = 0; if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - goto out_wait; + return 0; if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { if (flags & SYNC_TRYLOCK) - goto out_wait; + return 0; xfs_ilock(ip, XFS_IOLOCK_SHARED); } error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 0 : XBF_ASYNC, FI_NONE); xfs_iunlock(ip, XFS_IOLOCK_SHARED); - - out_wait: - if (flags & SYNC_WAIT) - xfs_ioend_wait(ip); return error; } @@ -322,6 +318,7 @@ xfs_sync_fsdata( struct xfs_mount *mp) { struct xfs_buf *bp; + int error; /* * If the buffer is pinned then push on the log so we won't get stuck @@ -332,10 +329,11 @@ xfs_sync_fsdata( * between there and here. */ bp = xfs_getsb(mp, 0); - if (XFS_BUF_ISPINNED(bp)) + if (xfs_buf_ispinned(bp)) xfs_log_force(mp, 0); - - return xfs_bwrite(mp, bp); + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + return error; } /* @@ -379,7 +377,7 @@ xfs_quiesce_data( /* flush data-only devices */ if (mp->m_rtdev_targp) - XFS_bflush(mp->m_rtdev_targp); + xfs_flush_buftarg(mp->m_rtdev_targp, 1); return error ? error : error2; } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/xfs_sync.h index 941202e7ac6e..941202e7ac6e 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/xfs_sync.h diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index ee2d2adaa438..ee2d2adaa438 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index b9937d450f8e..b9937d450f8e 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/xfs_trace.c index 88d25d4aa56e..9010ce885e6a 100644 --- a/fs/xfs/linux-2.6/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -43,8 +43,8 @@ #include "xfs_quota.h" #include "xfs_iomap.h" #include "xfs_aops.h" -#include "quota/xfs_dquot_item.h" -#include "quota/xfs_dquot.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" #include "xfs_log_recover.h" #include "xfs_inode_item.h" diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/xfs_trace.h index fda0708ef2ea..f1d2802b2f07 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -30,6 +30,7 @@ struct xfs_buf_log_item; struct xfs_da_args; struct xfs_da_node_entry; struct xfs_dquot; +struct xfs_log_item; struct xlog_ticket; struct log; struct xlog_recover; @@ -320,7 +321,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele); DEFINE_BUF_EVENT(xfs_buf_iodone); DEFINE_BUF_EVENT(xfs_buf_iorequest); DEFINE_BUF_EVENT(xfs_buf_bawrite); -DEFINE_BUF_EVENT(xfs_buf_bdwrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); DEFINE_BUF_EVENT(xfs_buf_trylock); @@ -571,12 +571,13 @@ DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL -DEFINE_INODE_EVENT(xfs_check_acl); +DEFINE_INODE_EVENT(xfs_get_acl); #endif DEFINE_INODE_EVENT(xfs_vm_bmap); DEFINE_INODE_EVENT(xfs_file_ioctl); DEFINE_INODE_EVENT(xfs_file_compat_ioctl); DEFINE_INODE_EVENT(xfs_ioctl_setattr); +DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); DEFINE_INODE_EVENT(xfs_write_inode); @@ -853,6 +854,42 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); +DECLARE_EVENT_CLASS(xfs_log_item_class, + TP_PROTO(struct xfs_log_item *lip), + TP_ARGS(lip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, lip) + __field(uint, type) + __field(uint, flags) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->lip = lip; + __entry->type = lip->li_type; + __entry->flags = lip->li_flags; + __entry->lsn = lip->li_lsn; + ), + TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lip, + CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_LOG_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_log_item_class, name, \ + TP_PROTO(struct xfs_log_item *lip), \ + TP_ARGS(lip)) +DEFINE_LOG_ITEM_EVENT(xfs_ail_push); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); +DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); + + DECLARE_EVENT_CLASS(xfs_file_class, TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), TP_ARGS(ip, count, offset, flags), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index efc147f0e9b6..1f35b2feca97 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1790,9 +1790,7 @@ xfs_trans_commit_cil( } /* - * xfs_trans_commit - * - * Commit the given transaction to the log a/synchronously. + * Commit the given transaction to the log. * * XFS disk error handling mechanism is not based on a typical * transaction abort mechanism. Logically after the filesystem @@ -1804,10 +1802,9 @@ xfs_trans_commit_cil( * Do not reference the transaction structure after this call. */ int -_xfs_trans_commit( +xfs_trans_commit( struct xfs_trans *tp, - uint flags, - int *log_flushed) + uint flags) { struct xfs_mount *mp = tp->t_mountp; xfs_lsn_t commit_lsn = -1; @@ -1866,7 +1863,7 @@ _xfs_trans_commit( if (sync) { if (!error) { error = _xfs_log_force_lsn(mp, commit_lsn, - XFS_LOG_SYNC, log_flushed); + XFS_LOG_SYNC, NULL); } XFS_STATS_INC(xs_trans_sync); } else { @@ -2021,6 +2018,6 @@ xfs_trans_roll( if (error) return error; - xfs_trans_ijoin(trans, dp); + xfs_trans_ijoin(trans, dp, 0); return 0; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 06a9759b6352..3ae713c0abd9 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -326,7 +326,7 @@ typedef struct xfs_log_item { struct xfs_log_item *); /* buffer item iodone */ /* callback func */ - struct xfs_item_ops *li_ops; /* function list */ + const struct xfs_item_ops *li_ops; /* function list */ /* delayed logging */ struct list_head li_cil; /* CIL pointers */ @@ -341,7 +341,7 @@ typedef struct xfs_log_item { { XFS_LI_IN_AIL, "IN_AIL" }, \ { XFS_LI_ABORTED, "ABORTED" } -typedef struct xfs_item_ops { +struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); @@ -350,9 +350,9 @@ typedef struct xfs_item_ops { void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); void (*iop_push)(xfs_log_item_t *); - void (*iop_pushbuf)(xfs_log_item_t *); + bool (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); -} xfs_item_ops_t; +}; #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) @@ -470,8 +470,7 @@ void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); -void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); -void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); +void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); @@ -487,10 +486,7 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t); -int _xfs_trans_commit(xfs_trans_t *, - uint flags, - int *); -#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) +int xfs_trans_commit(xfs_trans_t *, uint flags); void xfs_trans_cancel(xfs_trans_t *, int); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 43233e92f0f6..ed9252bcdac9 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -26,10 +26,9 @@ #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" +#include "xfs_trace.h" #include "xfs_error.h" -struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ - #ifdef DEBUG /* * Check that the list is sorted as it should be. @@ -299,7 +298,7 @@ xfs_trans_ail_cursor_last( * Splice the log item list into the AIL at the given LSN. We splice to the * tail of the given LSN to maintain insert order for push traversals. The * cursor is optional, allowing repeated updates to the same LSN to avoid - * repeated traversals. + * repeated traversals. This should not be called with an empty list. */ static void xfs_ail_splice( @@ -308,50 +307,39 @@ xfs_ail_splice( struct list_head *list, xfs_lsn_t lsn) { - struct xfs_log_item *lip = cur ? cur->item : NULL; - struct xfs_log_item *next_lip; + struct xfs_log_item *lip; + + ASSERT(!list_empty(list)); /* - * Get a new cursor if we don't have a placeholder or the existing one - * has been invalidated. + * Use the cursor to determine the insertion point if one is + * provided. If not, or if the one we got is not valid, + * find the place in the AIL where the items belong. */ - if (!lip || (__psint_t)lip & 1) { + lip = cur ? cur->item : NULL; + if (!lip || (__psint_t) lip & 1) lip = __xfs_trans_ail_cursor_last(ailp, lsn); - if (!lip) { - /* The list is empty, so just splice and return. */ - if (cur) - cur->item = NULL; - list_splice(list, &ailp->xa_ail); - return; - } - } + /* + * If a cursor is provided, we know we're processing the AIL + * in lsn order, and future items to be spliced in will + * follow the last one being inserted now. Update the + * cursor to point to that last item, now while we have a + * reliable pointer to it. + */ + if (cur) + cur->item = list_entry(list->prev, struct xfs_log_item, li_ail); /* - * Our cursor points to the item we want to insert _after_, so we have - * to update the cursor to point to the end of the list we are splicing - * in so that it points to the correct location for the next splice. - * i.e. before the splice - * - * lsn -> lsn -> lsn + x -> lsn + x ... - * ^ - * | cursor points here - * - * After the splice we have: - * - * lsn -> lsn -> lsn -> lsn -> .... -> lsn -> lsn + x -> lsn + x ... - * ^ ^ - * | cursor points here | needs to move here - * - * So we set the cursor to the last item in the list to be spliced - * before we execute the splice, resulting in the cursor pointing to - * the correct item after the splice occurs. + * Finally perform the splice. Unless the AIL was empty, + * lip points to the item in the AIL _after_ which the new + * items should go. If lip is null the AIL was empty, so + * the new items go at the head of the AIL. */ - if (cur) { - next_lip = list_entry(list->prev, struct xfs_log_item, li_ail); - cur->item = next_lip; - } - list_splice(list, &lip->li_ail); + if (lip) + list_splice(list, &lip->li_ail); + else + list_splice(list, &ailp->xa_ail); } /* @@ -367,28 +355,34 @@ xfs_ail_delete( xfs_trans_ail_cursor_clear(ailp, lip); } -/* - * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself - * to run at a later time if there is more work to do to complete the push. - */ -STATIC void -xfs_ail_worker( - struct work_struct *work) +static long +xfsaild_push( + struct xfs_ail *ailp) { - struct xfs_ail *ailp = container_of(to_delayed_work(work), - struct xfs_ail, xa_work); xfs_mount_t *mp = ailp->xa_mount; struct xfs_ail_cursor cur; xfs_log_item_t *lip; xfs_lsn_t lsn; xfs_lsn_t target; long tout = 10; - int flush_log = 0; int stuck = 0; int count = 0; int push_xfsbufd = 0; + /* + * If last time we ran we encountered pinned items, force the log first + * and wait for it before pushing again. + */ spin_lock(&ailp->xa_lock); + if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush && + !list_empty(&ailp->xa_ail)) { + ailp->xa_log_flush = 0; + spin_unlock(&ailp->xa_lock); + XFS_STATS_INC(xs_push_ail_flush); + xfs_log_force(mp, XFS_LOG_SYNC); + spin_lock(&ailp->xa_lock); + } + target = ailp->xa_target; lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); if (!lip || XFS_FORCED_SHUTDOWN(mp)) { @@ -432,26 +426,37 @@ xfs_ail_worker( switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(xs_push_ail_success); + trace_xfs_ail_push(lip); + IOP_PUSH(lip); ailp->xa_last_pushed_lsn = lsn; break; case XFS_ITEM_PUSHBUF: XFS_STATS_INC(xs_push_ail_pushbuf); - IOP_PUSHBUF(lip); - ailp->xa_last_pushed_lsn = lsn; + trace_xfs_ail_pushbuf(lip); + + if (!IOP_PUSHBUF(lip)) { + trace_xfs_ail_pushbuf_pinned(lip); + stuck++; + ailp->xa_log_flush++; + } else { + ailp->xa_last_pushed_lsn = lsn; + } push_xfsbufd = 1; break; case XFS_ITEM_PINNED: XFS_STATS_INC(xs_push_ail_pinned); + trace_xfs_ail_pinned(lip); + stuck++; - flush_log = 1; + ailp->xa_log_flush++; break; case XFS_ITEM_LOCKED: XFS_STATS_INC(xs_push_ail_locked); - ailp->xa_last_pushed_lsn = lsn; + trace_xfs_ail_locked(lip); stuck++; break; @@ -491,16 +496,6 @@ xfs_ail_worker( xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); - if (flush_log) { - /* - * If something we need to push out was pinned, then - * push out the log so it will become unpinned and - * move forward in the AIL. - */ - XFS_STATS_INC(xs_push_ail_flush); - xfs_log_force(mp, 0); - } - if (push_xfsbufd) { /* we've got delayed write buffers to flush */ wake_up_process(mp->m_ddev_targp->bt_task); @@ -511,20 +506,7 @@ out_done: if (!count) { /* We're past our target or empty, so idle */ ailp->xa_last_pushed_lsn = 0; - - /* - * We clear the XFS_AIL_PUSHING_BIT first before checking - * whether the target has changed. If the target has changed, - * this pushes the requeue race directly onto the result of the - * atomic test/set bit, so we are guaranteed that either the - * the pusher that changed the target or ourselves will requeue - * the work (but not both). - */ - clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags); - smp_rmb(); - if (XFS_LSN_CMP(ailp->xa_target, target) == 0 || - test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) - return; + ailp->xa_log_flush = 0; tout = 50; } else if (XFS_LSN_CMP(lsn, target) >= 0) { @@ -543,14 +525,39 @@ out_done: * were stuck. * * Backoff a bit more to allow some I/O to complete before - * continuing from where we were. + * restarting from the start of the AIL. This prevents us + * from spinning on the same items, and if they are pinned will + * all the restart to issue a log force to unpin the stuck + * items. */ tout = 20; + ailp->xa_last_pushed_lsn = 0; + } + + return tout; +} + +static int +xfsaild( + void *data) +{ + struct xfs_ail *ailp = data; + long tout = 0; /* milliseconds */ + + while (!kthread_should_stop()) { + if (tout && tout <= 20) + __set_current_state(TASK_KILLABLE); + else + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(tout ? + msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); + + try_to_freeze(); + + tout = xfsaild_push(ailp); } - /* There is more to do, requeue us. */ - queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, - msecs_to_jiffies(tout)); + return 0; } /* @@ -585,8 +592,9 @@ xfs_ail_push( */ smp_wmb(); xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); - if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) - queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0); + smp_wmb(); + + wake_up_process(ailp->xa_task); } /* @@ -682,6 +690,7 @@ xfs_trans_ail_update_bulk( int i; LIST_HEAD(tmp); + ASSERT(nr_items > 0); /* Not required, but true. */ mlip = xfs_ail_min(ailp); for (i = 0; i < nr_items; i++) { @@ -701,7 +710,8 @@ xfs_trans_ail_update_bulk( list_add(&lip->li_ail, &tmp); } - xfs_ail_splice(ailp, cur, &tmp, lsn); + if (!list_empty(&tmp)) + xfs_ail_splice(ailp, cur, &tmp, lsn); if (!mlip_changed) { spin_unlock(&ailp->xa_lock); @@ -822,9 +832,18 @@ xfs_trans_ail_init( INIT_LIST_HEAD(&ailp->xa_ail); INIT_LIST_HEAD(&ailp->xa_cursors); spin_lock_init(&ailp->xa_lock); - INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker); + + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->xa_mount->m_fsname); + if (IS_ERR(ailp->xa_task)) + goto out_free_ailp; + mp->m_ail = ailp; return 0; + +out_free_ailp: + kmem_free(ailp); + return ENOMEM; } void @@ -833,6 +852,6 @@ xfs_trans_ail_destroy( { struct xfs_ail *ailp = mp->m_ail; - cancel_delayed_work_sync(&ailp->xa_work); + kthread_stop(ailp->xa_task); kmem_free(ailp); } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 15584fc3ed7d..475a4ded4f41 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -54,7 +54,7 @@ xfs_trans_buf_item_match( list_for_each_entry(lidp, &tp->t_items, lid_trans) { blip = (struct xfs_buf_log_item *)lidp->lid_item; if (blip->bli_item.li_type == XFS_LI_BUF && - XFS_BUF_TARGET(blip->bli_buf) == target && + blip->bli_buf->b_target == target && XFS_BUF_ADDR(blip->bli_buf) == blkno && XFS_BUF_COUNT(blip->bli_buf) == len) return blip->bli_buf; @@ -80,7 +80,6 @@ _xfs_trans_bjoin( { struct xfs_buf_log_item *bip; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == NULL); /* @@ -161,8 +160,10 @@ xfs_trans_get_buf(xfs_trans_t *tp, bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); if (bp != NULL) { ASSERT(xfs_buf_islocked(bp)); - if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) - XFS_BUF_SUPER_STALE(bp); + if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); + } /* * If the buffer is stale then it was binval'ed @@ -194,7 +195,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, return NULL; } - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!bp->b_error); _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_get_buf(bp->b_fspriv); @@ -293,10 +294,9 @@ xfs_trans_read_buf( return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); - if (XFS_BUF_GETERROR(bp) != 0) { - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); - error = XFS_BUF_GETERROR(bp); + if (bp->b_error) { + error = bp->b_error; + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); return error; } @@ -330,7 +330,7 @@ xfs_trans_read_buf( ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_transp == tp); ASSERT(bp->b_fspriv != NULL); - ASSERT((XFS_BUF_ISERROR(bp)) == 0); + ASSERT(!bp->b_error); if (!(XFS_BUF_ISDONE(bp))) { trace_xfs_trans_read_buf_io(bp, _RET_IP_); ASSERT(!XFS_BUF_ISASYNC(bp)); @@ -338,8 +338,7 @@ xfs_trans_read_buf( xfsbdstrat(tp->t_mountp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); /* * We can gracefully recover from most read @@ -386,12 +385,11 @@ xfs_trans_read_buf( return (flags & XBF_TRYLOCK) ? 0 : XFS_ERROR(ENOMEM); } - if (XFS_BUF_GETERROR(bp) != 0) { - XFS_BUF_SUPER_STALE(bp); - error = XFS_BUF_GETERROR(bp); - - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + if (bp->b_error) { + error = bp->b_error; + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); + xfs_buf_ioerror_alert(bp, __func__); if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); @@ -430,7 +428,7 @@ shutdown_abort: if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) xfs_notice(mp, "about to pop assert, bp == 0x%p", bp); #endif - ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != + ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) != (XBF_STALE|XBF_DELWRI)); trace_xfs_trans_read_buf_shut(bp, _RET_IP_); @@ -581,7 +579,6 @@ xfs_trans_bhold(xfs_trans_t *tp, { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); @@ -602,7 +599,6 @@ xfs_trans_bhold_release(xfs_trans_t *tp, { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); @@ -631,7 +627,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); @@ -648,13 +643,14 @@ xfs_trans_log_buf(xfs_trans_t *tp, * inside the b_bdstrat callback so that this won't get written to * disk. */ - XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); ASSERT(atomic_read(&bip->bli_refcount) > 0); bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; + xfs_buf_delwri_queue(bp); + trace_xfs_trans_log_buf(bip); /* @@ -702,7 +698,6 @@ xfs_trans_binval( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -744,8 +739,7 @@ xfs_trans_binval( * We set the stale bit in the buffer as well since we're getting * rid of it. */ - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; @@ -774,7 +768,6 @@ xfs_trans_inode_buf( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -798,7 +791,6 @@ xfs_trans_stale_inode_buf( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -823,7 +815,6 @@ xfs_trans_inode_alloc_buf( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -851,7 +842,6 @@ xfs_trans_dquot_buf( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(type == XFS_BLF_UDQUOT_BUF || diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 4d00ee67792d..4d00ee67792d 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index c8dea2fd7e68..32f0288ae10f 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -47,11 +47,13 @@ xfs_trans_inode_broot_debug( * Add a locked inode to the transaction. * * The inode must be locked, and it cannot be associated with any transaction. + * If lock_flags is non-zero the inode will be unlocked on transaction commit. */ void xfs_trans_ijoin( struct xfs_trans *tp, - struct xfs_inode *ip) + struct xfs_inode *ip, + uint lock_flags) { xfs_inode_log_item_t *iip; @@ -59,7 +61,9 @@ xfs_trans_ijoin( if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; + ASSERT(iip->ili_lock_flags == 0); + iip->ili_lock_flags = lock_flags; /* * Get a log_item_desc to point at the new item. @@ -70,25 +74,6 @@ xfs_trans_ijoin( } /* - * Add a locked inode to the transaction. - * - * - * Grabs a reference to the inode which will be dropped when the transaction - * is committed. The inode will also be unlocked at that point. The inode - * must be locked, and it cannot be associated with any transaction. - */ -void -xfs_trans_ijoin_ref( - struct xfs_trans *tp, - struct xfs_inode *ip, - uint lock_flags) -{ - xfs_trans_ijoin(tp, ip); - IHOLD(ip); - ip->i_itemp->ili_lock_flags = lock_flags; -} - -/* * Transactional inode timestamp update. Requires the inode to be locked and * joined to the transaction supplied. Relies on the transaction subsystem to * track dirty state and update/writeback the inode accordingly. diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 212946b97239..44820b9fcb43 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -64,23 +64,18 @@ struct xfs_ail_cursor { */ struct xfs_ail { struct xfs_mount *xa_mount; + struct task_struct *xa_task; struct list_head xa_ail; xfs_lsn_t xa_target; struct list_head xa_cursors; spinlock_t xa_lock; - struct delayed_work xa_work; xfs_lsn_t xa_last_pushed_lsn; - unsigned long xa_flags; + int xa_log_flush; }; -#define XFS_AIL_PUSHING_BIT 0 - /* * From xfs_trans_ail.c */ - -extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ - void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/xfs_vnode.h index 7c220b4227bc..7c220b4227bc 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/xfs_vnode.h diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 88d121486c52..ce9268a2f56b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -72,8 +72,8 @@ xfs_readlink_bmap( xfs_buf_t *bp; int error = 0; - error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0, - mval, &nmaps, NULL); + error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps, + 0); if (error) goto out; @@ -83,10 +83,11 @@ xfs_readlink_bmap( bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK); - error = XFS_BUF_GETERROR(bp); + if (!bp) + return XFS_ERROR(ENOMEM); + error = bp->b_error; if (error) { - xfs_ioerror_alert("xfs_readlink", - ip->i_mount, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); goto out; } @@ -94,7 +95,7 @@ xfs_readlink_bmap( byte_cnt = pathlen; pathlen -= byte_cnt; - memcpy(link, XFS_BUF_PTR(bp), byte_cnt); + memcpy(link, bp->b_addr, byte_cnt); xfs_buf_relse(bp); } @@ -111,7 +112,7 @@ xfs_readlink( char *link) { xfs_mount_t *mp = ip->i_mount; - int pathlen; + xfs_fsize_t pathlen; int error = 0; trace_xfs_readlink(ip); @@ -121,13 +122,19 @@ xfs_readlink( xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK); - ASSERT(ip->i_d.di_size <= MAXPATHLEN); - pathlen = ip->i_d.di_size; if (!pathlen) goto out; + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); + } + + if (ip->i_df.if_flags & XFS_IFINLINE) { memcpy(link, ip->i_df.if_u1.if_data, pathlen); link[pathlen] = '\0'; @@ -176,8 +183,7 @@ xfs_free_eofblocks( nimaps = 1; xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, - NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!error && (nimaps != 0) && @@ -218,7 +224,7 @@ xfs_free_eofblocks( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, ip->i_size); if (error) { @@ -287,7 +293,7 @@ xfs_inactive_symlink_rmt( xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); size = (int)ip->i_d.di_size; ip->i_d.di_size = 0; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Find the block(s) so we can inval and unmap them. @@ -295,9 +301,9 @@ xfs_inactive_symlink_rmt( done = 0; xfs_bmap_init(&free_list, &first_block); nmaps = ARRAY_SIZE(mval); - if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), - XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, - &free_list))) + error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size), + mval, &nmaps, 0); + if (error) goto error0; /* * Invalidate the block(s). @@ -306,6 +312,10 @@ xfs_inactive_symlink_rmt( bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + if (!bp) { + error = ENOMEM; + goto error1; + } xfs_trans_binval(tp, bp); } /* @@ -331,7 +341,7 @@ xfs_inactive_symlink_rmt( * Mark it dirty so it will be logged and moved forward in the log as * part of every commit. */ - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Get a new, empty transaction to return to our caller. @@ -464,7 +474,7 @@ xfs_inactive_attrs( goto error_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_idestroy_fork(ip, XFS_ATTR_FORK); ASSERT(ip->i_d.di_anextents == 0); @@ -529,7 +539,7 @@ xfs_release( if (ip->i_d.di_nlink == 0) return 0; - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + if ((S_ISREG(ip->i_d.di_mode) && ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && @@ -610,7 +620,7 @@ xfs_inactive( truncate = ((ip->i_d.di_nlink == 0) && ((ip->i_d.di_size != 0) || (ip->i_size != 0) || (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && - ((ip->i_d.di_mode & S_IFMT) == S_IFREG)); + S_ISREG(ip->i_d.di_mode)); mp = ip->i_mount; @@ -621,7 +631,7 @@ xfs_inactive( goto out; if (ip->i_d.di_nlink != 0) { - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + if ((S_ISREG(ip->i_d.di_mode) && ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS) && @@ -645,8 +655,6 @@ xfs_inactive( if (truncate) { xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_ioend_wait(ip); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, @@ -660,7 +668,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, 0); if (error) { @@ -669,7 +677,7 @@ xfs_inactive( xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return VN_INACTIVE_CACHE; } - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) { + } else if (S_ISLNK(ip->i_d.di_mode)) { /* * If we get an error while cleaning up a @@ -684,7 +692,7 @@ xfs_inactive( return VN_INACTIVE_CACHE; } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); } else { error = xfs_trans_reserve(tp, 0, XFS_IFREE_LOG_RES(mp), @@ -697,7 +705,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); } /* @@ -937,7 +945,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1258,8 +1266,8 @@ xfs_remove( xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * If we're removing a directory perform some additional validation. @@ -1404,8 +1412,8 @@ xfs_link( xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); /* * If the source has too many links, we can't make any more to it. @@ -1599,7 +1607,7 @@ xfs_symlink( * transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; /* @@ -1630,10 +1638,9 @@ xfs_symlink( first_fsb = 0; nmaps = SYMLINK_MAPS; - error = xfs_bmapi(tp, ip, first_fsb, fs_blocks, - XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, - &first_block, resblks, mval, &nmaps, - &free_list); + error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_METADATA, &first_block, resblks, + mval, &nmaps, &free_list); if (error) goto error2; @@ -1648,13 +1655,16 @@ xfs_symlink( byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); - ASSERT(bp && !XFS_BUF_GETERROR(bp)); + if (!bp) { + error = ENOMEM; + goto error2; + } if (pathlen < byte_cnt) { byte_cnt = pathlen; } pathlen -= byte_cnt; - memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt); + memcpy(bp->b_addr, cur_chunk, byte_cnt); cur_chunk += byte_cnt; xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1); @@ -1730,7 +1740,7 @@ xfs_set_dmattrs( return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); ip->i_d.di_dmevmask = evmask; ip->i_d.di_dmstate = state; @@ -1776,7 +1786,6 @@ xfs_alloc_file_space( xfs_fileoff_t startoffset_fsb; xfs_fsblock_t firstfsb; int nimaps; - int bmapi_flag; int quota_flag; int rt; xfs_trans_t *tp; @@ -1804,7 +1813,6 @@ xfs_alloc_file_space( count = len; imapp = &imaps[0]; nimaps = 1; - bmapi_flag = XFS_BMAPI_WRITE | alloc_type; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); allocatesize_fsb = XFS_B_TO_FSB(mp, count); @@ -1875,16 +1883,12 @@ xfs_alloc_file_space( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); - /* - * Issue the xfs_bmapi() call to allocate the blocks - */ xfs_bmap_init(&free_list, &firstfsb); - error = xfs_bmapi(tp, ip, startoffset_fsb, - allocatesize_fsb, bmapi_flag, - &firstfsb, 0, imapp, &nimaps, - &free_list); + error = xfs_bmapi_write(tp, ip, startoffset_fsb, + allocatesize_fsb, alloc_type, &firstfsb, + 0, imapp, &nimaps, &free_list); if (error) { goto error0; } @@ -1974,8 +1978,7 @@ xfs_zero_remaining_bytes( for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { offset_fsb = XFS_B_TO_FSBT(mp, offset); nimap = 1; - error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, - NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); if (error || nimap < 1) break; ASSERT(imap.br_blockcount >= 1); @@ -1995,11 +1998,11 @@ xfs_zero_remaining_bytes( xfsbdstrat(mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", - mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(read)"); break; } - memset(XFS_BUF_PTR(bp) + + memset(bp->b_addr + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), 0, lastoffset - offset + 1); XFS_BUF_UNDONE(bp); @@ -2008,8 +2011,8 @@ xfs_zero_remaining_bytes( xfsbdstrat(mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", - mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(write)"); break; } } @@ -2074,7 +2077,7 @@ xfs_free_file_space( if (need_iolock) { xfs_ilock(ip, XFS_IOLOCK_EXCL); /* wait for the completion of any pending DIOs */ - xfs_ioend_wait(ip); + inode_dio_wait(VFS_I(ip)); } rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); @@ -2094,8 +2097,8 @@ xfs_free_file_space( */ if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { nimap = 1; - error = xfs_bmapi(NULL, ip, startoffset_fsb, - 1, 0, NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, startoffset_fsb, 1, + &imap, &nimap, 0); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); @@ -2109,8 +2112,8 @@ xfs_free_file_space( startoffset_fsb += mp->m_sb.sb_rextsize - mod; } nimap = 1; - error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, - 1, 0, NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, + &imap, &nimap, 0); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); @@ -2178,7 +2181,7 @@ xfs_free_file_space( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * issue the bunmapi() call to free the blocks @@ -2351,8 +2354,7 @@ xfs_change_file_space( } xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); if ((attr_flags & XFS_ATTR_DMI) == 0) { ip->i_d.di_mode &= ~S_ISUID; @@ -2377,10 +2379,5 @@ xfs_change_file_space( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (attr_flags & XFS_ATTR_SYNC) xfs_trans_set_sync(tp); - - error = xfs_trans_commit(tp, 0); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - return error; + return xfs_trans_commit(tp, 0); } diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 87d3e03878c8..87d3e03878c8 100644 --- a/fs/xfs/linux-2.6/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c |