summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/ceph/addr.c90
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c536
-rw-r--r--fs/ceph/debugfs.c16
-rw-r--r--fs/ceph/dir.c132
-rw-r--r--fs/ceph/export.c5
-rw-r--r--fs/ceph/file.c486
-rw-r--r--fs/ceph/inode.c84
-rw-r--r--fs/ceph/ioctl.c2
-rw-r--r--fs/ceph/locks.c31
-rw-r--r--fs/ceph/mds_client.c240
-rw-r--r--fs/ceph/mds_client.h30
-rw-r--r--fs/ceph/super.c28
-rw-r--r--fs/ceph/super.h70
-rw-r--r--fs/cifs/Kconfig2
-rw-r--r--fs/cifs/cifs_debug.c6
-rw-r--r--fs/cifs/cifsfs.c4
-rw-r--r--fs/cifs/cifsglob.h4
-rw-r--r--fs/cifs/file.c61
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/readdir.c2
-rw-r--r--fs/cifs/smb2misc.c14
-rw-r--r--fs/cifs/smb2proto.h6
-rw-r--r--fs/cifs/smb2transport.c87
-rw-r--r--fs/cifs/smbdirect.c313
-rw-r--r--fs/cifs/smbdirect.h7
-rw-r--r--fs/dax.c59
-rw-r--r--fs/filesystems.c4
-rw-r--r--fs/hfsplus/attributes.c4
-rw-r--r--fs/io-wq.c12
-rw-r--r--fs/io-wq.h2
-rw-r--r--fs/io_uring.c428
-rw-r--r--fs/iomap/buffered-io.c17
-rw-r--r--fs/nfs/pnfs_nfs.c1
-rw-r--r--fs/ocfs2/alloc.c4
-rw-r--r--fs/orangefs/file.c34
-rw-r--r--fs/orangefs/inode.c39
-rw-r--r--fs/orangefs/orangefs-kernel.h4
-rw-r--r--fs/overlayfs/copy_up.c16
-rw-r--r--fs/overlayfs/dir.c31
-rw-r--r--fs/overlayfs/export.c40
-rw-r--r--fs/overlayfs/inode.c99
-rw-r--r--fs/overlayfs/namei.c5
-rw-r--r--fs/overlayfs/overlayfs.h25
-rw-r--r--fs/overlayfs/ovl_entry.h2
-rw-r--r--fs/overlayfs/readdir.c25
-rw-r--r--fs/overlayfs/super.c258
-rw-r--r--fs/overlayfs/util.c40
-rw-r--r--fs/proc/base.c10
-rw-r--r--fs/read_write.c3
-rw-r--r--fs/seq_file.c7
-rw-r--r--fs/udf/ecma_167.h2
-rw-r--r--fs/udf/osta_udf.h2
-rw-r--r--fs/xfs/libxfs/xfs_sb.c32
-rw-r--r--fs/xfs/xfs_buf.c11
-rw-r--r--fs/xfs/xfs_dquot.c6
-rw-r--r--fs/xfs/xfs_dquot_item.c3
-rw-r--r--fs/xfs/xfs_export.c14
-rw-r--r--fs/xfs/xfs_file.c16
-rw-r--r--fs/xfs/xfs_inode.c174
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_inode_item.c31
-rw-r--r--fs/xfs/xfs_log.c372
-rw-r--r--fs/xfs/xfs_log.h4
-rw-r--r--fs/xfs/xfs_log_cil.c55
-rw-r--r--fs/xfs/xfs_log_priv.h75
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_qm.c14
-rw-r--r--fs/xfs/xfs_super.c17
-rw-r--r--fs/xfs/xfs_symlink.c1
-rw-r--r--fs/xfs/xfs_trace.h15
-rw-r--r--fs/xfs/xfs_trans.c27
-rw-r--r--fs/xfs/xfs_trans_ail.c88
-rw-r--r--fs/xfs/xfs_trans_priv.h6
74 files changed, 2768 insertions, 1630 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7ab616601141..6f4678d98df7 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -159,8 +159,6 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
if (!PagePrivate(page))
return;
- ClearPageChecked(page);
-
dout("%p invalidatepage %p idx %lu full dirty page\n",
inode, page, page->index);
@@ -183,6 +181,47 @@ static int ceph_releasepage(struct page *page, gfp_t g)
}
/*
+ * Read some contiguous pages. If we cross a stripe boundary, shorten
+ * *plen. Return number of bytes read, or error.
+ */
+static int ceph_sync_readpages(struct ceph_fs_client *fsc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u32 truncate_seq, u64 truncate_size,
+ struct page **pages, int num_pages,
+ int page_align)
+{
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
+ int rc = 0;
+
+ dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+ vino.snap, off, *plen);
+ req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, truncate_seq, truncate_size,
+ false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* it may be a short read due to an object boundary */
+ osd_req_op_extent_osd_data_pages(req, 0,
+ pages, *plen, page_align, false, false);
+
+ dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
+ off, *plen, *plen, page_align);
+
+ rc = ceph_osdc_start_request(osdc, req, false);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ dout("readpages result %d\n", rc);
+ return rc;
+}
+
+/*
* read a single page, without unlocking it.
*/
static int ceph_do_readpage(struct file *filp, struct page *page)
@@ -218,7 +257,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
- err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
+ err = ceph_sync_readpages(fsc, ceph_vino(inode),
&ci->i_layout, off, &len,
ci->i_truncate_seq, ci->i_truncate_size,
&page, 1, 0);
@@ -571,6 +610,47 @@ static u64 get_writepages_data_length(struct inode *inode,
}
/*
+ * do a synchronous write on N pages
+ */
+static int ceph_sync_writepages(struct ceph_fs_client *fsc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ struct ceph_snap_context *snapc,
+ u64 off, u64 len,
+ u32 truncate_seq, u64 truncate_size,
+ struct timespec64 *mtime,
+ struct page **pages, int num_pages)
+{
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
+ int rc = 0;
+ int page_align = off & ~PAGE_MASK;
+
+ req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
+ snapc, truncate_seq, truncate_size,
+ true);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* it may be a short write due to an object boundary */
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
+ false, false);
+ dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
+
+ req->r_mtime = *mtime;
+ rc = ceph_osdc_start_request(osdc, req, true);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ if (rc == 0)
+ rc = len;
+ dout("writepages result %d\n", rc);
+ return rc;
+}
+
+/*
* Write a single page, but leave the page locked.
*
* If we get a write error, mark the mapping for error, but still adjust the
@@ -628,7 +708,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
set_page_writeback(page);
- err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode),
+ err = ceph_sync_writepages(fsc, ceph_vino(inode),
&ci->i_layout, snapc, page_off, len,
ceph_wbc.truncate_seq,
ceph_wbc.truncate_size,
@@ -1575,7 +1655,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
do {
lock_page(page);
- if ((off > size) || (page->mapping != inode->i_mapping)) {
+ if (page_mkwrite_check_truncate(page, inode) < 0) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
break;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 270b769607a2..2f5cb6bc78e1 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -32,7 +32,7 @@ struct ceph_fscache_entry {
size_t uniq_len;
/* The following members must be last */
struct ceph_fsid fsid;
- char uniquifier[0];
+ char uniquifier[];
};
static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 28ae0c134700..185db76300b3 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -490,13 +490,10 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
-
- ci->i_hold_caps_min = round_jiffies(jiffies +
- opt->caps_wanted_delay_min * HZ);
ci->i_hold_caps_max = round_jiffies(jiffies +
opt->caps_wanted_delay_max * HZ);
- dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
- ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+ dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
+ ci->i_hold_caps_max - jiffies);
}
/*
@@ -508,10 +505,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
* -> we take mdsc->cap_delay_lock
*/
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
- struct ceph_inode_info *ci,
- bool set_timeout)
+ struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+ dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
ci->i_ceph_flags, ci->i_hold_caps_max);
if (!mdsc->stopping) {
spin_lock(&mdsc->cap_delay_lock);
@@ -520,8 +516,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
goto no_change;
list_del_init(&ci->i_cap_delay_list);
}
- if (set_timeout)
- __cap_set_timeouts(mdsc, ci);
+ __cap_set_timeouts(mdsc, ci);
list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
spin_unlock(&mdsc->cap_delay_lock);
@@ -561,19 +556,20 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->cap_delay_lock);
}
-/*
- * Common issue checks for add_cap, handle_cap_grant.
- */
+/* Common issue checks for add_cap, handle_cap_grant. */
static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
unsigned issued)
{
unsigned had = __ceph_caps_issued(ci, NULL);
+ lockdep_assert_held(&ci->i_ceph_lock);
+
/*
* Each time we receive FILE_CACHE anew, we increment
* i_rdcache_gen.
*/
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
+ (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
(had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
ci->i_rdcache_gen++;
}
@@ -592,6 +588,13 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
__ceph_dir_clear_complete(ci);
}
}
+
+ /* Wipe saved layout if we're losing DIR_CREATE caps */
+ if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
+ !(issued & CEPH_CAP_DIR_CREATE)) {
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
+ }
}
/*
@@ -605,7 +608,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
*/
void ceph_add_cap(struct inode *inode,
struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
+ unsigned issued, unsigned wanted,
unsigned seq, unsigned mseq, u64 realmino, int flags,
struct ceph_cap **new_cap)
{
@@ -621,13 +624,6 @@ void ceph_add_cap(struct inode *inode,
dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
session->s_mds, cap_id, ceph_cap_string(issued), seq);
- /*
- * If we are opening the file, include file mode wanted bits
- * in wanted.
- */
- if (fmode >= 0)
- wanted |= ceph_caps_for_mode(fmode);
-
spin_lock(&session->s_gen_ttl_lock);
gen = session->s_cap_gen;
spin_unlock(&session->s_gen_ttl_lock);
@@ -725,7 +721,7 @@ void ceph_add_cap(struct inode *inode,
dout(" issued %s, mds wanted %s, actual %s, queueing\n",
ceph_cap_string(issued), ceph_cap_string(wanted),
ceph_cap_string(actual_wanted));
- __cap_delay_requeue(mdsc, ci, true);
+ __cap_delay_requeue(mdsc, ci);
}
if (flags & CEPH_CAP_FLAG_AUTH) {
@@ -752,9 +748,6 @@ void ceph_add_cap(struct inode *inode,
cap->issue_seq = seq;
cap->mseq = mseq;
cap->cap_gen = gen;
-
- if (fmode >= 0)
- __ceph_get_fmode(ci, fmode);
}
/*
@@ -958,29 +951,97 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD;
if (ci->i_rdcache_ref ||
- (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
+ (S_ISREG(ci->vfs_inode.i_mode) &&
ci->vfs_inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
if (ci->i_wb_ref || ci->i_wrbuffer_ref)
used |= CEPH_CAP_FILE_BUFFER;
+ if (ci->i_fx_ref)
+ used |= CEPH_CAP_FILE_EXCL;
return used;
}
+#define FMODE_WAIT_BIAS 1000
+
/*
* wanted, by virtue of open file modes
*/
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
- int i, bits = 0;
- for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
- if (ci->i_nr_by_mode[i])
- bits |= 1 << i;
+ const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
+ const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
+ const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
+ const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
+ struct ceph_mount_options *opt =
+ ceph_inode_to_client(&ci->vfs_inode)->mount_options;
+ unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
+ unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
+
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ int want = 0;
+
+ /* use used_cutoff here, to keep dir's wanted caps longer */
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
+ time_after(ci->i_last_rd, used_cutoff))
+ want |= CEPH_CAP_ANY_SHARED;
+
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
+ time_after(ci->i_last_wr, used_cutoff)) {
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+ want |= CEPH_CAP_ANY_DIR_OPS;
+ }
+
+ if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
+ want |= CEPH_CAP_PIN;
+
+ return want;
+ } else {
+ int bits = 0;
+
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
+ if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
+ time_after(ci->i_last_rd, used_cutoff))
+ bits |= 1 << RD_SHIFT;
+ } else if (time_after(ci->i_last_rd, idle_cutoff)) {
+ bits |= 1 << RD_SHIFT;
+ }
+
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
+ if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
+ time_after(ci->i_last_wr, used_cutoff))
+ bits |= 1 << WR_SHIFT;
+ } else if (time_after(ci->i_last_wr, idle_cutoff)) {
+ bits |= 1 << WR_SHIFT;
+ }
+
+ /* check lazyio only when read/write is wanted */
+ if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
+ ci->i_nr_by_mode[LAZY_SHIFT] > 0)
+ bits |= 1 << LAZY_SHIFT;
+
+ return bits ? ceph_caps_for_mode(bits >> 1) : 0;
}
- if (bits == 0)
- return 0;
- return ceph_caps_for_mode(bits >> 1);
+}
+
+/*
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
+ */
+int __ceph_caps_wanted(struct ceph_inode_info *ci)
+{
+ int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ /* we want EXCL if holding caps of dir ops */
+ if (w & CEPH_CAP_ANY_DIR_OPS)
+ w |= CEPH_CAP_FILE_EXCL;
+ } else {
+ /* we want EXCL if dirty data */
+ if (w & CEPH_CAP_FILE_BUFFER)
+ w |= CEPH_CAP_FILE_EXCL;
+ }
+ return w;
}
/*
@@ -1004,14 +1065,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
return mds_wanted;
}
-/*
- * called under i_ceph_lock
- */
-static int __ceph_is_single_caps(struct ceph_inode_info *ci)
-{
- return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
-}
-
int ceph_is_any_caps(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1274,9 +1327,15 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct cap_msg_args arg;
int held, revoking;
int wake = 0;
- int delayed = 0;
int ret;
+ /* Don't send anything if it's still being created. Return delayed */
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ spin_unlock(&ci->i_ceph_lock);
+ dout("%s async create in flight for %p\n", __func__, inode);
+ return 1;
+ }
+
held = cap->issued | cap->implemented;
revoking = cap->implemented & ~cap->issued;
retain &= ~revoking;
@@ -1287,28 +1346,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ceph_cap_string(revoking));
BUG_ON((retain & CEPH_CAP_PIN) == 0);
- arg.session = cap->session;
-
- /* don't release wanted unless we've waited a bit. */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_min)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- want |= cap->mds_wanted;
- retain |= cap->issued;
- delayed = 1;
- }
- ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
- if (want & ~cap->mds_wanted) {
- /* user space may open/close single file frequently.
- * This avoids droping mds_wanted immediately after
- * requesting new mds_wanted.
- */
- __cap_set_timeouts(mdsc, ci);
- }
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH;
cap->issued &= retain; /* drop bits we don't want */
if (cap->implemented & ~cap->issued) {
@@ -1323,6 +1361,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
cap->implemented &= cap->issued | used;
cap->mds_wanted = want;
+ arg.session = cap->session;
arg.ino = ceph_vino(inode).ino;
arg.cid = cap->cap_id;
arg.follows = flushing ? ci->i_head_snapc->seq : 0;
@@ -1332,7 +1371,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
arg.size = inode->i_size;
ci->i_reported_size = arg.size;
arg.max_size = ci->i_wanted_max_size;
- ci->i_requested_max_size = arg.max_size;
+ if (cap == ci->i_auth_cap)
+ ci->i_requested_max_size = arg.max_size;
if (flushing & CEPH_CAP_XATTR_EXCL) {
old_blob = __ceph_build_xattrs_blob(ci);
@@ -1383,14 +1423,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ret = send_cap_msg(&arg);
if (ret < 0) {
- dout("error sending cap msg, must requeue %p\n", inode);
- delayed = 1;
+ pr_err("error sending cap msg, ino (%llx.%llx) "
+ "flushing %s tid %llu, requeue\n",
+ ceph_vinop(inode), ceph_cap_string(flushing),
+ flush_tid);
+ spin_lock(&ci->i_ceph_lock);
+ __cap_delay_requeue(mdsc, ci);
+ spin_unlock(&ci->i_ceph_lock);
}
if (wake)
wake_up_all(&ci->i_cap_wq);
- return delayed;
+ return ret;
}
static inline int __send_flush_snap(struct inode *inode,
@@ -1617,6 +1662,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
int was = ci->i_dirty_caps;
int dirty = 0;
+ lockdep_assert_held(&ci->i_ceph_lock);
+
if (!ci->i_auth_cap) {
pr_warn("__mark_dirty_caps %p %llx mask %s, "
"but no auth cap (session was closed?)\n",
@@ -1654,7 +1701,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
(mask & CEPH_CAP_FILE_BUFFER))
dirty |= I_DIRTY_DATASYNC;
- __cap_delay_requeue(mdsc, ci, true);
+ __cap_delay_requeue(mdsc, ci);
return dirty;
}
@@ -1726,6 +1773,7 @@ static u64 __mark_caps_flushing(struct inode *inode,
struct ceph_cap_flush *cf = NULL;
int flushing;
+ lockdep_assert_held(&ci->i_ceph_lock);
BUG_ON(ci->i_dirty_caps == 0);
BUG_ON(list_empty(&ci->i_dirty_item));
BUG_ON(!ci->i_prealloc_cap_flush);
@@ -1805,8 +1853,6 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
* versus held caps. Release, flush, ack revoked caps to mds as
* appropriate.
*
- * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
- * cap release further.
* CHECK_CAPS_AUTHONLY - we should only check the auth cap
* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
* further delay.
@@ -1825,24 +1871,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
struct rb_node *p;
- int delayed = 0, sent = 0;
- bool no_delay = flags & CHECK_CAPS_NODELAY;
bool queue_invalidate = false;
bool tried_invalidate = false;
- /* if we are unmounting, flush any unused caps immediately. */
- if (mdsc->stopping)
- no_delay = true;
-
spin_lock(&ci->i_ceph_lock);
-
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
- if (!(flags & CHECK_CAPS_AUTHONLY) ||
- (ci->i_auth_cap && __ceph_is_single_caps(ci)))
- __cap_delay_cancel(mdsc, ci);
-
goto retry_locked;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1866,10 +1901,11 @@ retry_locked:
* revoking the shared cap on every create/unlink
* operation.
*/
- if (IS_RDONLY(inode))
+ if (IS_RDONLY(inode)) {
want = CEPH_CAP_ANY_SHARED;
- else
- want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ } else {
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ }
retain |= want;
} else {
@@ -1885,14 +1921,13 @@ retry_locked:
}
dout("check_caps %p file_want %s used %s dirty %s flushing %s"
- " issued %s revoking %s retain %s %s%s%s\n", inode,
+ " issued %s revoking %s retain %s %s%s\n", inode,
ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(issued), ceph_cap_string(revoking),
ceph_cap_string(retain),
(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
- (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
(flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
/*
@@ -1900,8 +1935,8 @@ retry_locked:
* have cached pages, but don't want them, then try to invalidate.
* If we fail, it's because pages are locked.... try again later.
*/
- if ((!no_delay || mdsc->stopping) &&
- !S_ISDIR(inode->i_mode) && /* ignore readdir cache */
+ if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
+ S_ISREG(inode->i_mode) &&
!(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
(revoking & (CEPH_CAP_FILE_CACHE|
@@ -1973,28 +2008,17 @@ retry_locked:
}
/* want more caps from mds? */
- if (want & ~(cap->mds_wanted | cap->issued))
- goto ack;
+ if (want & ~cap->mds_wanted) {
+ if (want & ~(cap->mds_wanted | cap->issued))
+ goto ack;
+ if (!__cap_is_valid(cap))
+ goto ack;
+ }
/* things we might delay */
if ((cap->issued & ~retain) == 0)
continue; /* nope, all good */
- if (no_delay)
- goto ack;
-
- /* delay? */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_max)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- delayed++;
- continue;
- }
-
ack:
if (session && session != cap->session) {
dout("oops, wrong session %p mutex\n", session);
@@ -2055,18 +2079,20 @@ ack:
}
mds = cap->mds; /* remember mds, so we don't repeat */
- sent++;
/* __send_cap drops i_ceph_lock */
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
- cap_used, want, retain, flushing,
- flush_tid, oldest_flush_tid);
+ __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want,
+ retain, flushing, flush_tid, oldest_flush_tid);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
- /* Reschedule delayed caps release if we delayed anything */
- if (delayed)
- __cap_delay_requeue(mdsc, ci, false);
+ /* periodically re-calculate caps wanted by open files */
+ if (__ceph_is_any_real_caps(ci) &&
+ list_empty(&ci->i_cap_delay_list) &&
+ (file_wanted & ~CEPH_CAP_PIN) &&
+ !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ __cap_delay_requeue(mdsc, ci);
+ }
spin_unlock(&ci->i_ceph_lock);
@@ -2095,7 +2121,6 @@ retry:
retry_locked:
if (ci->i_dirty_caps && ci->i_auth_cap) {
struct ceph_cap *cap = ci->i_auth_cap;
- int delayed;
if (session != cap->session) {
spin_unlock(&ci->i_ceph_lock);
@@ -2124,18 +2149,10 @@ retry_locked:
&oldest_flush_tid);
/* __send_cap drops i_ceph_lock */
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- CEPH_CLIENT_CAPS_SYNC,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- (cap->issued | cap->implemented),
- flushing, flush_tid, oldest_flush_tid);
-
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci, true);
- spin_unlock(&ci->i_ceph_lock);
- }
+ __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
+ __ceph_caps_used(ci), __ceph_caps_wanted(ci),
+ (cap->issued | cap->implemented),
+ flushing, flush_tid, oldest_flush_tid);
} else {
if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
@@ -2233,6 +2250,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (datasync)
goto out;
+ ret = ceph_wait_on_async_create(inode);
+ if (ret)
+ goto out;
+
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
@@ -2335,22 +2356,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
if (cf->caps) {
dout("kick_flushing_caps %p cap %p tid %llu %s\n",
inode, cap, cf->tid, ceph_cap_string(cf->caps));
- ci->i_ceph_flags |= CEPH_I_NODELAY;
-
- ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
(cf->tid < last_snap_flush ?
CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
__ceph_caps_used(ci),
__ceph_caps_wanted(ci),
(cap->issued | cap->implemented),
cf->caps, cf->tid, oldest_flush_tid);
- if (ret) {
- pr_err("kick_flushing_caps: error sending "
- "cap flush, ino (%llx.%llx) "
- "tid %llu flushing %s\n",
- ceph_vinop(inode), cf->tid,
- ceph_cap_string(cf->caps));
- }
} else {
struct ceph_cap_snap *capsnap =
container_of(cf, struct ceph_cap_snap,
@@ -2457,16 +2469,15 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
}
}
-static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- struct inode *inode)
- __releases(ci->i_ceph_lock)
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
+ struct ceph_inode_info *ci)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *cap;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_cap *cap = ci->i_auth_cap;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- dout("kick_flushing_inode_caps %p flushing %s\n", inode,
+ dout("%s %p flushing %s\n", __func__, &ci->vfs_inode,
ceph_cap_string(ci->i_flushing_caps));
if (!list_empty(&ci->i_cap_flush_list)) {
@@ -2478,9 +2489,6 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->cap_dirty_lock);
__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
- spin_unlock(&ci->i_ceph_lock);
- } else {
- spin_unlock(&ci->i_ceph_lock);
}
}
@@ -2488,18 +2496,20 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
/*
* Take references to capabilities we hold, so that we don't release
* them to the MDS prematurely.
- *
- * Protected by i_ceph_lock.
*/
-static void __take_cap_refs(struct ceph_inode_info *ci, int got,
+void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
bool snap_rwsem_locked)
{
+ lockdep_assert_held(&ci->i_ceph_lock);
+
if (got & CEPH_CAP_PIN)
ci->i_pin_ref++;
if (got & CEPH_CAP_FILE_RD)
ci->i_rd_ref++;
if (got & CEPH_CAP_FILE_CACHE)
ci->i_rdcache_ref++;
+ if (got & CEPH_CAP_FILE_EXCL)
+ ci->i_fx_ref++;
if (got & CEPH_CAP_FILE_WR) {
if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
BUG_ON(!snap_rwsem_locked);
@@ -2512,7 +2522,7 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
if (ci->i_wb_ref == 0)
ihold(&ci->vfs_inode);
ci->i_wb_ref++;
- dout("__take_cap_refs %p wb %d -> %d (?)\n",
+ dout("%s %p wb %d -> %d (?)\n", __func__,
&ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
}
}
@@ -2524,14 +2534,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
* Note that caller is responsible for ensuring max_size increases are
* requested from the MDS.
*
- * Returns 0 if caps were not able to be acquired (yet), a 1 if they were,
- * or a negative error code.
- *
- * FIXME: how does a 0 return differ from -EAGAIN?
+ * Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
+ * or a negative error code. There are 3 speical error codes:
+ * -EAGAIN: need to sleep but non-blocking is specified
+ * -EFBIG: ask caller to call check_max_size() and try again.
+ * -ESTALE: ask caller to call ceph_renew_caps() and try again.
*/
enum {
- NON_BLOCKING = 1,
- CHECK_FILELOCK = 2,
+ /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
+ NON_BLOCKING = (1 << 8),
+ CHECK_FILELOCK = (1 << 9),
};
static int try_get_cap_refs(struct inode *inode, int need, int want,
@@ -2541,7 +2553,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
int ret = 0;
int have, implemented;
- int file_wanted;
bool snap_rwsem_locked = false;
dout("get_cap_refs %p need %s want %s\n", inode,
@@ -2557,15 +2568,6 @@ again:
goto out_unlock;
}
- /* make sure file is actually open */
- file_wanted = __ceph_caps_file_wanted(ci);
- if ((file_wanted & need) != need) {
- dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
- ceph_cap_string(need), ceph_cap_string(file_wanted));
- ret = -EBADF;
- goto out_unlock;
- }
-
/* finish pending truncate */
while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock);
@@ -2584,7 +2586,7 @@ again:
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
inode, endoff, ci->i_max_size);
if (endoff > ci->i_requested_max_size)
- ret = -EAGAIN;
+ ret = ci->i_auth_cap ? -EFBIG : -ESTALE;
goto out_unlock;
}
/*
@@ -2630,51 +2632,55 @@ again:
}
snap_rwsem_locked = true;
}
- *got = need | (have & want);
- if ((need & CEPH_CAP_FILE_RD) &&
+ if ((have & want) == want)
+ *got = need | want;
+ else
+ *got = need;
+ if (S_ISREG(inode->i_mode) &&
+ (need & CEPH_CAP_FILE_RD) &&
!(*got & CEPH_CAP_FILE_CACHE))
ceph_disable_fscache_readpage(ci);
- __take_cap_refs(ci, *got, true);
+ ceph_take_cap_refs(ci, *got, true);
ret = 1;
}
} else {
int session_readonly = false;
- if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
+ int mds_wanted;
+ if (ci->i_auth_cap &&
+ (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
struct ceph_mds_session *s = ci->i_auth_cap->session;
spin_lock(&s->s_cap_lock);
session_readonly = s->s_readonly;
spin_unlock(&s->s_cap_lock);
}
if (session_readonly) {
- dout("get_cap_refs %p needed %s but mds%d readonly\n",
+ dout("get_cap_refs %p need %s but mds%d readonly\n",
inode, ceph_cap_string(need), ci->i_auth_cap->mds);
ret = -EROFS;
goto out_unlock;
}
- if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
- int mds_wanted;
- if (READ_ONCE(mdsc->fsc->mount_state) ==
- CEPH_MOUNT_SHUTDOWN) {
- dout("get_cap_refs %p forced umount\n", inode);
- ret = -EIO;
- goto out_unlock;
- }
- mds_wanted = __ceph_caps_mds_wanted(ci, false);
- if (need & ~(mds_wanted & need)) {
- dout("get_cap_refs %p caps were dropped"
- " (session killed?)\n", inode);
- ret = -ESTALE;
- goto out_unlock;
- }
- if (!(file_wanted & ~mds_wanted))
- ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ dout("get_cap_refs %p forced umount\n", inode);
+ ret = -EIO;
+ goto out_unlock;
+ }
+ mds_wanted = __ceph_caps_mds_wanted(ci, false);
+ if (need & ~mds_wanted) {
+ dout("get_cap_refs %p need %s > mds_wanted %s\n",
+ inode, ceph_cap_string(need),
+ ceph_cap_string(mds_wanted));
+ ret = -ESTALE;
+ goto out_unlock;
}
- dout("get_cap_refs %p have %s needed %s\n", inode,
+ dout("get_cap_refs %p have %s need %s\n", inode,
ceph_cap_string(have), ceph_cap_string(need));
}
out_unlock:
+
+ __ceph_touch_fmode(ci, mdsc, flags);
+
spin_unlock(&ci->i_ceph_lock);
if (snap_rwsem_locked)
up_read(&mdsc->snap_rwsem);
@@ -2712,20 +2718,40 @@ static void check_max_size(struct inode *inode, loff_t endoff)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
+static inline int get_used_fmode(int caps)
+{
+ int fmode = 0;
+ if (caps & CEPH_CAP_FILE_RD)
+ fmode |= CEPH_FILE_MODE_RD;
+ if (caps & CEPH_CAP_FILE_WR)
+ fmode |= CEPH_FILE_MODE_WR;
+ return fmode;
+}
+
int ceph_try_get_caps(struct inode *inode, int need, int want,
bool nonblock, int *got)
{
- int ret;
+ int ret, flags;
BUG_ON(need & ~CEPH_CAP_FILE_RD);
- BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
- ret = ceph_pool_perm_check(inode, need);
- if (ret < 0)
- return ret;
+ BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
+ CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_ANY_DIR_OPS));
+ if (need) {
+ ret = ceph_pool_perm_check(inode, need);
+ if (ret < 0)
+ return ret;
+ }
+
+ flags = get_used_fmode(need | want);
+ if (nonblock)
+ flags |= NON_BLOCKING;
- ret = try_get_cap_refs(inode, need, want, 0,
- (nonblock ? NON_BLOCKING : 0), got);
- return ret == -EAGAIN ? 0 : ret;
+ ret = try_get_cap_refs(inode, need, want, 0, flags, got);
+ /* three special error codes */
+ if (ret == -EAGAIN || ret == -EFBIG || ret == -EAGAIN)
+ ret = 0;
+ return ret;
}
/*
@@ -2750,16 +2776,16 @@ int ceph_get_caps(struct file *filp, int need, int want,
fi->filp_gen != READ_ONCE(fsc->filp_gen))
return -EBADF;
- while (true) {
- if (endoff > 0)
- check_max_size(inode, endoff);
+ flags = get_used_fmode(need | want);
- flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
+ while (true) {
+ flags &= CEPH_FILE_MODE_MASK;
+ if (atomic_read(&fi->num_locks))
+ flags |= CHECK_FILELOCK;
_got = 0;
ret = try_get_cap_refs(inode, need, want, endoff,
flags, &_got);
- if (ret == -EAGAIN)
- continue;
+ WARN_ON_ONCE(ret == -EAGAIN);
if (!ret) {
struct ceph_mds_client *mdsc = fsc->mdsc;
struct cap_wait cw;
@@ -2774,6 +2800,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
list_add(&cw.list, &mdsc->cap_wait_list);
spin_unlock(&mdsc->caps_list_lock);
+ /* make sure used fmode not timeout */
+ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
add_wait_queue(&ci->i_cap_wq, &wait);
flags |= NON_BLOCKING;
@@ -2787,6 +2815,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
remove_wait_queue(&ci->i_cap_wq, &wait);
+ ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
spin_lock(&mdsc->caps_list_lock);
list_del(&cw.list);
@@ -2804,16 +2833,26 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
if (ret < 0) {
+ if (ret == -EFBIG || ret == -ESTALE) {
+ int ret2 = ceph_wait_on_async_create(inode);
+ if (ret2 < 0)
+ return ret2;
+ }
+ if (ret == -EFBIG) {
+ check_max_size(inode, endoff);
+ continue;
+ }
if (ret == -ESTALE) {
/* session was killed, try renew caps */
- ret = ceph_renew_caps(inode);
+ ret = ceph_renew_caps(inode, flags);
if (ret == 0)
continue;
}
return ret;
}
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
+ ci->i_inline_version != CEPH_INLINE_NONE &&
(_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
i_size_read(inode) > 0) {
struct page *page =
@@ -2846,7 +2885,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
break;
}
- if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
+ (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
ceph_fscache_revalidate_cookie(ci);
*got = _got;
@@ -2860,7 +2900,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
{
spin_lock(&ci->i_ceph_lock);
- __take_cap_refs(ci, caps, false);
+ ceph_take_cap_refs(ci, caps, false);
spin_unlock(&ci->i_ceph_lock);
}
@@ -2911,6 +2951,9 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
if (had & CEPH_CAP_FILE_CACHE)
if (--ci->i_rdcache_ref == 0)
last++;
+ if (had & CEPH_CAP_FILE_EXCL)
+ if (--ci->i_fx_ref == 0)
+ last++;
if (had & CEPH_CAP_FILE_BUFFER) {
if (--ci->i_wb_ref == 0) {
last++;
@@ -2950,7 +2993,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
last ? " last" : "", put ? " put" : "");
- if (last && !flushsnaps)
+ if (last)
ceph_check_caps(ci, 0, NULL);
else if (flushsnaps)
ceph_flush_snaps(ci, NULL);
@@ -3032,7 +3075,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
spin_unlock(&ci->i_ceph_lock);
if (last) {
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, 0, NULL);
} else if (flush_snaps) {
ceph_flush_snaps(ci, NULL);
}
@@ -3133,7 +3176,7 @@ static void handle_cap_grant(struct inode *inode,
* try to invalidate (once). (If there are dirty buffers, we
* will invalidate _after_ writeback.)
*/
- if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
+ if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */
((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
!(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
@@ -3297,11 +3340,12 @@ static void handle_cap_grant(struct inode *inode,
ceph_cap_string(cap->issued),
ceph_cap_string(newcaps),
ceph_cap_string(revoking));
- if (revoking & used & CEPH_CAP_FILE_BUFFER)
+ if (S_ISREG(inode->i_mode) &&
+ (revoking & used & CEPH_CAP_FILE_BUFFER))
writeback = true; /* initiate writeback; will delay ack */
- else if (revoking == CEPH_CAP_FILE_CACHE &&
- (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
- queue_invalidate)
+ else if (queue_invalidate &&
+ revoking == CEPH_CAP_FILE_CACHE &&
+ (newcaps & CEPH_CAP_FILE_LAZYIO) == 0)
; /* do nothing yet, invalidation will be queued */
else if (cap == ci->i_auth_cap)
check_caps = 1; /* check auth cap only */
@@ -3339,7 +3383,8 @@ static void handle_cap_grant(struct inode *inode,
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
if (newcaps & ~extra_info->issued)
wake = true;
- kick_flushing_inode_caps(session->s_mdsc, session, inode);
+ ceph_kick_flushing_inode_caps(session, ci);
+ spin_unlock(&ci->i_ceph_lock);
up_read(&session->s_mdsc->snap_rwsem);
} else {
spin_unlock(&ci->i_ceph_lock);
@@ -3367,10 +3412,10 @@ static void handle_cap_grant(struct inode *inode,
wake_up_all(&ci->i_cap_wq);
if (check_caps == 1)
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
session);
else if (check_caps == 2)
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+ ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
else
mutex_unlock(&session->s_mutex);
}
@@ -3619,8 +3664,6 @@ retry:
goto out_unlock;
if (target < 0) {
- if (cap->mds_wanted | cap->issued)
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
__ceph_remove_cap(cap, false);
goto out_unlock;
}
@@ -3668,7 +3711,7 @@ retry:
/* add placeholder for the export tagert */
int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
tcap = new_cap;
- ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
+ ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
if (!list_empty(&ci->i_cap_flush_list) &&
@@ -3773,7 +3816,7 @@ retry:
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
- ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+ ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
@@ -4047,7 +4090,6 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
struct inode *inode;
struct ceph_inode_info *ci;
- int flags = CHECK_CAPS_NODELAY;
dout("check_delayed_caps\n");
while (1) {
@@ -4067,7 +4109,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
if (inode) {
dout("check_delayed_caps on %p\n", inode);
- ceph_check_caps(ci, flags, NULL);
+ ceph_check_caps(ci, 0, NULL);
/* avoid calling iput_final() in tick thread */
ceph_async_iput(inode);
}
@@ -4092,7 +4134,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
ihold(inode);
dout("flush_dirty_caps %p\n", inode);
spin_unlock(&mdsc->cap_dirty_lock);
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
}
@@ -4100,14 +4142,31 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
dout("flush_dirty_caps done\n");
}
-void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
+void __ceph_touch_fmode(struct ceph_inode_info *ci,
+ struct ceph_mds_client *mdsc, int fmode)
+{
+ unsigned long now = jiffies;
+ if (fmode & CEPH_FILE_MODE_RD)
+ ci->i_last_rd = now;
+ if (fmode & CEPH_FILE_MODE_WR)
+ ci->i_last_wr = now;
+ /* queue periodic check */
+ if (fmode &&
+ __ceph_is_any_real_caps(ci) &&
+ list_empty(&ci->i_cap_delay_list))
+ __cap_delay_requeue(mdsc, ci);
+}
+
+void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
int i;
int bits = (fmode << 1) | 1;
+ spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i))
- ci->i_nr_by_mode[i]++;
+ ci->i_nr_by_mode[i] += count;
}
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -4115,26 +4174,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
* we may need to release capabilities to the MDS (or schedule
* their delayed release).
*/
-void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- int i, last = 0;
+ int i;
int bits = (fmode << 1) | 1;
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i)) {
- BUG_ON(ci->i_nr_by_mode[i] == 0);
- if (--ci->i_nr_by_mode[i] == 0)
- last++;
+ BUG_ON(ci->i_nr_by_mode[i] < count);
+ ci->i_nr_by_mode[i] -= count;
}
}
- dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
- &ci->vfs_inode, fmode,
- ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
- ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
spin_unlock(&ci->i_ceph_lock);
-
- if (last && ci->i_vino.snap == CEPH_NOSNAP)
- ceph_check_caps(ci, 0, NULL);
}
/*
@@ -4152,7 +4203,6 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
if (inode->i_nlink == 1) {
drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
- ci->i_ceph_flags |= CEPH_I_NODELAY;
if (__ceph_caps_dirty(ci)) {
struct ceph_mds_client *mdsc =
ceph_inode_to_client(inode)->mdsc;
@@ -4208,8 +4258,6 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
if (force || (cap->issued & drop)) {
if (cap->issued & drop) {
int wanted = __ceph_caps_wanted(ci);
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
- wanted |= cap->mds_wanted;
dout("encode_inode_release %p cap %p "
"%s -> %s, wanted %s -> %s\n", inode, cap,
ceph_cap_string(cap->issued),
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fb7cabd98e7b..481ac97b4d25 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -218,10 +218,10 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
return 0;
}
-CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
-CEPH_DEFINE_SHOW_FUNC(mdsc_show)
-CEPH_DEFINE_SHOW_FUNC(caps_show)
-CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
+DEFINE_SHOW_ATTRIBUTE(mdsmap);
+DEFINE_SHOW_ATTRIBUTE(mdsc);
+DEFINE_SHOW_ATTRIBUTE(caps);
+DEFINE_SHOW_ATTRIBUTE(mds_sessions);
/*
@@ -281,25 +281,25 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
0400,
fsc->client->debugfs_dir,
fsc,
- &mdsmap_show_fops);
+ &mdsmap_fops);
fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
0400,
fsc->client->debugfs_dir,
fsc,
- &mds_sessions_show_fops);
+ &mds_sessions_fops);
fsc->debugfs_mdsc = debugfs_create_file("mdsc",
0400,
fsc->client->debugfs_dir,
fsc,
- &mdsc_show_fops);
+ &mdsc_fops);
fsc->debugfs_caps = debugfs_create_file("caps",
0400,
fsc->client->debugfs_dir,
fsc,
- &caps_show_fops);
+ &caps_fops);
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d0cd0aba5843..d594c2627430 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -335,8 +335,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ctx->pos = 2;
}
- /* can we use the dcache? */
spin_lock(&ci->i_ceph_lock);
+ /* request Fx cap. if have Fx, we don't need to release Fs cap
+ * for later create/unlink. */
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR);
+ /* can we use the dcache? */
if (ceph_test_mount_opt(fsc, DCACHE) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
@@ -752,7 +755,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&ci->i_ceph_lock);
- dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+ dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
if (strncmp(dentry->d_name.name,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
@@ -760,6 +763,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
ceph_test_mount_opt(fsc, DCACHE) &&
__ceph_dir_is_complete(ci) &&
(__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
spin_unlock(&ci->i_ceph_lock);
dout(" dir %p complete, -ENOENT\n", dir);
d_add(dentry, NULL);
@@ -1036,6 +1040,78 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
return err;
}
+static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ int result = req->r_err ? req->r_err :
+ le32_to_cpu(req->r_reply_info.head->result);
+
+ if (result == -EJUKEBOX)
+ goto out;
+
+ /* If op failed, mark everyone involved for errors */
+ if (result) {
+ int pathlen;
+ u64 base;
+ char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ &base, 0);
+
+ /* mark error on parent + clear complete */
+ mapping_set_error(req->r_parent->i_mapping, result);
+ ceph_dir_clear_complete(req->r_parent);
+
+ /* drop the dentry -- we don't know its status */
+ if (!d_unhashed(req->r_dentry))
+ d_drop(req->r_dentry);
+
+ /* mark inode itself for an error (since metadata is bogus) */
+ mapping_set_error(req->r_old_inode->i_mapping, result);
+
+ pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path(path, pathlen);
+ }
+out:
+ iput(req->r_old_inode);
+ ceph_mdsc_release_dir_caps(req);
+}
+
+static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di;
+ int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK;
+
+ spin_lock(&ci->i_ceph_lock);
+ if ((__ceph_caps_issued(ci, NULL) & want) == want) {
+ ceph_take_cap_refs(ci, want, false);
+ got = want;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ /* If we didn't get anything, return 0 */
+ if (!got)
+ return 0;
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ /*
+ * - We are holding Fx, which implies Fs caps.
+ * - Only support async unlink for primary linkage
+ */
+ if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen ||
+ !(di->flags & CEPH_DENTRY_PRIMARY_LINK))
+ want = 0;
+ spin_unlock(&dentry->d_lock);
+
+ /* Do we still want what we've got? */
+ if (want == got)
+ return got;
+
+ ceph_put_cap_refs(ci, got);
+ return 0;
+}
+
/*
* rmdir and unlink are differ only by the metadata op code
*/
@@ -1045,6 +1121,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = d_inode(dentry);
struct ceph_mds_request *req;
+ bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
int err = -EROFS;
int op;
@@ -1059,6 +1136,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
} else
goto out;
+retry:
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -1067,13 +1145,39 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
- set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
- err = ceph_mdsc_do_request(mdsc, dir, req);
- if (!err && !req->r_reply_info.head->is_dentry)
- d_delete(dentry);
+
+ if (try_async && op == CEPH_MDS_OP_UNLINK &&
+ (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
+ dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
+ dentry->d_name.len, dentry->d_name.name,
+ ceph_cap_string(req->r_dir_caps));
+ set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+ req->r_callback = ceph_async_unlink_cb;
+ req->r_old_inode = d_inode(dentry);
+ ihold(req->r_old_inode);
+ err = ceph_mdsc_submit_request(mdsc, dir, req);
+ if (!err) {
+ /*
+ * We have enough caps, so we assume that the unlink
+ * will succeed. Fix up the target inode and dcache.
+ */
+ drop_nlink(inode);
+ d_delete(dentry);
+ } else if (err == -EJUKEBOX) {
+ try_async = false;
+ ceph_mdsc_put_request(req);
+ goto retry;
+ }
+ } else {
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ d_delete(dentry);
+ }
+
ceph_mdsc_put_request(req);
out:
return err;
@@ -1411,6 +1515,7 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
spin_lock(&dentry->d_lock);
di->time = jiffies;
di->lease_shared_gen = 0;
+ di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
__dentry_lease_unlist(di);
spin_unlock(&dentry->d_lock);
}
@@ -1520,7 +1625,8 @@ static int __dir_lease_try_check(const struct dentry *dentry)
/*
* Check if directory-wide content lease/cap is valid.
*/
-static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
+static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
+ struct ceph_mds_client *mdsc)
{
struct ceph_inode_info *ci = ceph_inode(dir);
int valid;
@@ -1528,7 +1634,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
spin_lock(&ci->i_ceph_lock);
valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
- shared_gen = atomic_read(&ci->i_shared_gen);
+ if (valid) {
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
+ shared_gen = atomic_read(&ci->i_shared_gen);
+ }
spin_unlock(&ci->i_ceph_lock);
if (valid) {
struct ceph_dentry_info *di;
@@ -1554,6 +1663,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
int valid = 0;
struct dentry *parent;
struct inode *dir, *inode;
+ struct ceph_mds_client *mdsc;
if (flags & LOOKUP_RCU) {
parent = READ_ONCE(dentry->d_parent);
@@ -1570,6 +1680,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry,
dentry, inode, ceph_dentry(dentry)->offset);
+ mdsc = ceph_sb_to_client(dir->i_sb)->mdsc;
+
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
@@ -1581,7 +1693,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
valid = dentry_lease_is_valid(dentry, flags);
if (valid == -ECHILD)
return valid;
- if (valid || dir_lease_is_valid(dir, dentry)) {
+ if (valid || dir_lease_is_valid(dir, dentry, mdsc)) {
if (inode)
valid = ceph_is_any_caps(inode);
else
@@ -1590,8 +1702,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
}
if (!valid) {
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(dir->i_sb)->mdsc;
struct ceph_mds_request *req;
int op, err;
u32 mask;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index b6bfa94332c3..79dc06881e78 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -315,6 +315,11 @@ static struct dentry *__get_parent(struct super_block *sb,
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err) {
+ ceph_mdsc_put_request(req);
+ return ERR_PTR(err);
+ }
+
inode = req->r_target_inode;
if (inode)
ihold(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5a478cd06e11..4a5ccbb7e808 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -212,10 +212,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
if (isdir) {
struct ceph_dir_file_info *dfi =
kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
- if (!dfi) {
- ceph_put_fmode(ci, fmode); /* clean up */
+ if (!dfi)
return -ENOMEM;
- }
file->private_data = dfi;
fi = &dfi->file_info;
@@ -223,15 +221,15 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
dfi->readdir_cache_idx = -1;
} else {
fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
- if (!fi) {
- ceph_put_fmode(ci, fmode); /* clean up */
+ if (!fi)
return -ENOMEM;
- }
file->private_data = fi;
}
+ ceph_get_fmode(ci, fmode, 1);
fi->fmode = fmode;
+
spin_lock_init(&fi->rw_contexts_lock);
INIT_LIST_HEAD(&fi->rw_contexts);
fi->meta_err = errseq_sample(&ci->i_meta_err);
@@ -263,7 +261,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFLNK:
dout("init_file %p %p 0%o (symlink)\n", inode, file,
inode->i_mode);
- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
break;
default:
@@ -273,7 +270,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
* we need to drop the open ref now, since we don't
* have .release set to ceph_release.
*/
- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
BUG_ON(inode->i_fop->release == ceph_release);
/* call the proper open fop */
@@ -285,14 +281,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
/*
* try renew caps after session gets killed.
*/
-int ceph_renew_caps(struct inode *inode)
+int ceph_renew_caps(struct inode *inode, int fmode)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
int err, flags, wanted;
spin_lock(&ci->i_ceph_lock);
+ __ceph_touch_fmode(ci, mdsc, fmode);
wanted = __ceph_caps_file_wanted(ci);
if (__ceph_is_any_real_caps(ci) &&
(!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
@@ -326,7 +323,6 @@ int ceph_renew_caps(struct inode *inode)
req->r_inode = inode;
ihold(inode);
req->r_num_caps = 1;
- req->r_fmode = -1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
@@ -372,9 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
/* trivially open snapdir */
if (ceph_snap(inode) == CEPH_SNAPDIR) {
- spin_lock(&ci->i_ceph_lock);
- __ceph_get_fmode(ci, fmode);
- spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
@@ -392,7 +385,7 @@ int ceph_open(struct inode *inode, struct file *file)
dout("open %p fmode %d want %s issued %s using existing\n",
inode, fmode, ceph_cap_string(wanted),
ceph_cap_string(issued));
- __ceph_get_fmode(ci, fmode);
+ __ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
/* adjust wanted? */
@@ -404,7 +397,7 @@ int ceph_open(struct inode *inode, struct file *file)
return ceph_init_file(inode, file, fmode);
} else if (ceph_snap(inode) != CEPH_NOSNAP &&
(ci->i_snap_caps & wanted) == wanted) {
- __ceph_get_fmode(ci, fmode);
+ __ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
@@ -430,6 +423,236 @@ out:
return err;
}
+/* Clone the layout from a synchronous create, if the dir now has Dc caps */
+static void
+cache_file_layout(struct inode *dst, struct inode *src)
+{
+ struct ceph_inode_info *cdst = ceph_inode(dst);
+ struct ceph_inode_info *csrc = ceph_inode(src);
+
+ spin_lock(&cdst->i_ceph_lock);
+ if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
+ !ceph_file_layout_is_valid(&cdst->i_cached_layout)) {
+ memcpy(&cdst->i_cached_layout, &csrc->i_layout,
+ sizeof(cdst->i_cached_layout));
+ rcu_assign_pointer(cdst->i_cached_layout.pool_ns,
+ ceph_try_get_string(csrc->i_layout.pool_ns));
+ }
+ spin_unlock(&cdst->i_ceph_lock);
+}
+
+/*
+ * Try to set up an async create. We need caps, a file layout, and inode number,
+ * and either a lease on the dentry or complete dir info. If any of those
+ * criteria are not satisfied, then return false and the caller can go
+ * synchronous.
+ */
+static int try_prep_async_create(struct inode *dir, struct dentry *dentry,
+ struct ceph_file_layout *lo, u64 *pino)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE;
+ u64 ino;
+
+ spin_lock(&ci->i_ceph_lock);
+ /* No auth cap means no chance for Dc caps */
+ if (!ci->i_auth_cap)
+ goto no_async;
+
+ /* Any delegated inos? */
+ if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos))
+ goto no_async;
+
+ if (!ceph_file_layout_is_valid(&ci->i_cached_layout))
+ goto no_async;
+
+ if ((__ceph_caps_issued(ci, NULL) & want) != want)
+ goto no_async;
+
+ if (d_in_lookup(dentry)) {
+ if (!__ceph_dir_is_complete(ci))
+ goto no_async;
+ spin_lock(&dentry->d_lock);
+ di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
+ spin_unlock(&dentry->d_lock);
+ } else if (atomic_read(&ci->i_shared_gen) !=
+ READ_ONCE(di->lease_shared_gen)) {
+ goto no_async;
+ }
+
+ ino = ceph_get_deleg_ino(ci->i_auth_cap->session);
+ if (!ino)
+ goto no_async;
+
+ *pino = ino;
+ ceph_take_cap_refs(ci, want, false);
+ memcpy(lo, &ci->i_cached_layout, sizeof(*lo));
+ rcu_assign_pointer(lo->pool_ns,
+ ceph_try_get_string(ci->i_cached_layout.pool_ns));
+ got = want;
+no_async:
+ spin_unlock(&ci->i_ceph_lock);
+ return got;
+}
+
+static void restore_deleg_ino(struct inode *dir, u64 ino)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_mds_session *s = NULL;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap)
+ s = ceph_get_mds_session(ci->i_auth_cap->session);
+ spin_unlock(&ci->i_ceph_lock);
+ if (s) {
+ int err = ceph_restore_deleg_ino(s, ino);
+ if (err)
+ pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n",
+ ino, err);
+ ceph_put_mds_session(s);
+ }
+}
+
+static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ int result = req->r_err ? req->r_err :
+ le32_to_cpu(req->r_reply_info.head->result);
+
+ if (result == -EJUKEBOX)
+ goto out;
+
+ mapping_set_error(req->r_parent->i_mapping, result);
+
+ if (result) {
+ struct dentry *dentry = req->r_dentry;
+ int pathlen;
+ u64 base;
+ char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ &base, 0);
+
+ ceph_dir_clear_complete(req->r_parent);
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+
+ /* FIXME: start returning I/O errors on all accesses? */
+ pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path(path, pathlen);
+ }
+
+ if (req->r_target_inode) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ u64 ino = ceph_vino(req->r_target_inode).ino;
+
+ if (req->r_deleg_ino != ino)
+ pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
+ __func__, req->r_err, req->r_deleg_ino, ino);
+ mapping_set_error(req->r_target_inode->i_mapping, result);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
+ wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+ }
+ ceph_kick_flushing_inode_caps(req->r_session, ci);
+ spin_unlock(&ci->i_ceph_lock);
+ } else {
+ pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
+ req->r_deleg_ino);
+ }
+out:
+ ceph_mdsc_release_dir_caps(req);
+}
+
+static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
+ struct file *file, umode_t mode,
+ struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as_ctx,
+ struct ceph_file_layout *lo)
+{
+ int ret;
+ char xattr_buf[4];
+ struct ceph_mds_reply_inode in = { };
+ struct ceph_mds_reply_info_in iinfo = { .in = &in };
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct inode *inode;
+ struct timespec64 now;
+ struct ceph_vino vino = { .ino = req->r_deleg_ino,
+ .snap = CEPH_NOSNAP };
+
+ ktime_get_real_ts64(&now);
+
+ inode = ceph_get_inode(dentry->d_sb, vino);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ iinfo.inline_version = CEPH_INLINE_NONE;
+ iinfo.change_attr = 1;
+ ceph_encode_timespec64(&iinfo.btime, &now);
+
+ iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+ iinfo.xattr_data = xattr_buf;
+ memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+
+ in.ino = cpu_to_le64(vino.ino);
+ in.snapid = cpu_to_le64(CEPH_NOSNAP);
+ in.version = cpu_to_le64(1); // ???
+ in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE);
+ in.cap.cap_id = cpu_to_le64(1);
+ in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
+ in.cap.flags = CEPH_CAP_FLAG_AUTH;
+ in.ctime = in.mtime = in.atime = iinfo.btime;
+ in.mode = cpu_to_le32((u32)mode);
+ in.truncate_seq = cpu_to_le32(1);
+ in.truncate_size = cpu_to_le64(-1ULL);
+ in.xattr_version = cpu_to_le64(1);
+ in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ?
+ dir->i_gid : current_fsgid()));
+ in.nlink = cpu_to_le32(1);
+ in.max_size = cpu_to_le64(lo->stripe_unit);
+
+ ceph_file_layout_to_legacy(lo, &in.layout);
+
+ ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
+ req->r_fmode, NULL);
+ if (ret) {
+ dout("%s failed to fill inode: %d\n", __func__, ret);
+ ceph_dir_clear_complete(dir);
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+ if (inode->i_state & I_NEW)
+ discard_new_inode(inode);
+ } else {
+ struct dentry *dn;
+
+ dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
+ vino.ino, dir->i_ino, dentry->d_name.name);
+ ceph_dir_clear_ordered(dir);
+ ceph_init_inode_acls(inode, as_ctx);
+ if (inode->i_state & I_NEW) {
+ /*
+ * If it's not I_NEW, then someone created this before
+ * we got here. Assume the server is aware of it at
+ * that point and don't worry about setting
+ * CEPH_I_ASYNC_CREATE.
+ */
+ ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE;
+ unlock_new_inode(inode);
+ }
+ if (d_in_lookup(dentry) || d_really_is_negative(dentry)) {
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+ dn = d_splice_alias(inode, dentry);
+ WARN_ON_ONCE(dn && dn != dentry);
+ }
+ file->f_mode |= FMODE_CREATED;
+ ret = finish_open(file, dentry, ceph_open);
+ }
+ return ret;
+}
/*
* Do a lookup + open with a single request. If we get a non-existent
@@ -443,6 +666,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_mds_request *req;
struct dentry *dn;
struct ceph_acl_sec_ctx as_ctx = {};
+ bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
int mask;
int err;
@@ -466,7 +690,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
/* If it's not being looked up, it's negative */
return -ENOENT;
}
-
+retry:
/* do the open */
req = prepare_open_request(dir->i_sb, flags, mode);
if (IS_ERR(req)) {
@@ -475,21 +699,43 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.open.mask = cpu_to_le32(mask);
+ req->r_parent = dir;
+
if (flags & O_CREAT) {
+ struct ceph_file_layout lo;
+
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
if (as_ctx.pagelist) {
req->r_pagelist = as_ctx.pagelist;
as_ctx.pagelist = NULL;
}
+ if (try_async &&
+ (req->r_dir_caps =
+ try_prep_async_create(dir, dentry, &lo,
+ &req->r_deleg_ino))) {
+ set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+ req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
+ req->r_callback = ceph_async_create_cb;
+ err = ceph_mdsc_submit_request(mdsc, dir, req);
+ if (!err) {
+ err = ceph_finish_async_create(dir, dentry,
+ file, mode, req,
+ &as_ctx, &lo);
+ } else if (err == -EJUKEBOX) {
+ restore_deleg_ino(dir, req->r_deleg_ino);
+ ceph_mdsc_put_request(req);
+ try_async = false;
+ goto retry;
+ }
+ goto out_req;
+ }
}
- mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
- if (ceph_security_xattr_wanted(dir))
- mask |= CEPH_CAP_XATTR_SHARED;
- req->r_args.open.mask = cpu_to_le32(mask);
-
- req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
@@ -518,14 +764,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
} else {
dout("atomic_open finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
- ceph_init_inode_acls(d_inode(dentry), &as_ctx);
+ struct inode *newino = d_inode(dentry);
+
+ cache_file_layout(dir, newino);
+ ceph_init_inode_acls(newino, &as_ctx);
file->f_mode |= FMODE_CREATED;
}
err = finish_open(file, dentry, ceph_open);
}
out_req:
- if (!req->r_err && req->r_target_inode)
- ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
ceph_mdsc_put_request(req);
out_ctx:
ceph_release_acl_sec_ctx(&as_ctx);
@@ -542,7 +789,7 @@ int ceph_release(struct inode *inode, struct file *file)
dout("release inode %p dir file %p\n", inode, file);
WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
- ceph_put_fmode(ci, dfi->file_info.fmode);
+ ceph_put_fmode(ci, dfi->file_info.fmode, 1);
if (dfi->last_readdir)
ceph_mdsc_put_request(dfi->last_readdir);
@@ -554,7 +801,8 @@ int ceph_release(struct inode *inode, struct file *file)
dout("release inode %p regular file %p\n", inode, file);
WARN_ON(!list_empty(&fi->rw_contexts));
- ceph_put_fmode(ci, fi->fmode);
+ ceph_put_fmode(ci, fi->fmode, 1);
+
kmem_cache_free(ceph_file_cachep, fi);
}
@@ -1567,7 +1815,7 @@ retry_snap:
if (dirty)
__mark_inode_dirty(inode, dirty);
if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
+ ceph_check_caps(ci, 0, NULL);
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
@@ -1944,6 +2192,71 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
return 0;
}
+static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
+ struct ceph_inode_info *dst_ci, u64 *dst_off,
+ struct ceph_fs_client *fsc,
+ size_t len, unsigned int flags)
+{
+ struct ceph_object_locator src_oloc, dst_oloc;
+ struct ceph_object_id src_oid, dst_oid;
+ size_t bytes = 0;
+ u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
+ u32 src_objlen, dst_objlen;
+ u32 object_size = src_ci->i_layout.object_size;
+ int ret;
+
+ src_oloc.pool = src_ci->i_layout.pool_id;
+ src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
+ dst_oloc.pool = dst_ci->i_layout.pool_id;
+ dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
+
+ while (len >= object_size) {
+ ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
+ object_size, &src_objnum,
+ &src_objoff, &src_objlen);
+ ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off,
+ object_size, &dst_objnum,
+ &dst_objoff, &dst_objlen);
+ ceph_oid_init(&src_oid);
+ ceph_oid_printf(&src_oid, "%llx.%08llx",
+ src_ci->i_vino.ino, src_objnum);
+ ceph_oid_init(&dst_oid);
+ ceph_oid_printf(&dst_oid, "%llx.%08llx",
+ dst_ci->i_vino.ino, dst_objnum);
+ /* Do an object remote copy */
+ ret = ceph_osdc_copy_from(&fsc->client->osdc,
+ src_ci->i_vino.snap, 0,
+ &src_oid, &src_oloc,
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
+ &dst_oid, &dst_oloc,
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
+ dst_ci->i_truncate_seq,
+ dst_ci->i_truncate_size,
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
+ if (ret) {
+ if (ret == -EOPNOTSUPP) {
+ fsc->have_copy_from2 = false;
+ pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
+ }
+ dout("ceph_osdc_copy_from returned %d\n", ret);
+ if (!bytes)
+ bytes = ret;
+ goto out;
+ }
+ len -= object_size;
+ bytes += object_size;
+ *src_off += object_size;
+ *dst_off += object_size;
+ }
+
+out:
+ ceph_oloc_destroy(&src_oloc);
+ ceph_oloc_destroy(&dst_oloc);
+ return bytes;
+}
+
static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off,
size_t len, unsigned int flags)
@@ -1954,14 +2267,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
struct ceph_cap_flush *prealloc_cf;
struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
- struct ceph_object_locator src_oloc, dst_oloc;
- struct ceph_object_id src_oid, dst_oid;
- loff_t endoff = 0, size;
- ssize_t ret = -EIO;
+ loff_t size;
+ ssize_t ret = -EIO, bytes;
u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
- u32 src_objlen, dst_objlen, object_size;
+ u32 src_objlen, dst_objlen;
int src_got = 0, dst_got = 0, err, dirty;
- bool do_final_copy = false;
if (src_inode->i_sb != dst_inode->i_sb) {
struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
@@ -2039,22 +2349,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (ret < 0)
goto out_caps;
- size = i_size_read(dst_inode);
- endoff = dst_off + len;
-
/* Drop dst file cached pages */
ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
dst_off >> PAGE_SHIFT,
- endoff >> PAGE_SHIFT);
+ (dst_off + len) >> PAGE_SHIFT);
if (ret < 0) {
dout("Failed to invalidate inode pages (%zd)\n", ret);
ret = 0; /* XXX */
}
- src_oloc.pool = src_ci->i_layout.pool_id;
- src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
- dst_oloc.pool = dst_ci->i_layout.pool_id;
- dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
-
ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
src_ci->i_layout.object_size,
&src_objnum, &src_objoff, &src_objlen);
@@ -2073,6 +2375,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
* starting at the src_off
*/
if (src_objoff) {
+ dout("Initial partial copy of %u bytes\n", src_objlen);
+
/*
* we need to temporarily drop all caps as we'll be calling
* {read,write}_iter, which will get caps again.
@@ -2080,8 +2384,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
ret = do_splice_direct(src_file, &src_off, dst_file,
&dst_off, src_objlen, flags);
- if (ret < 0) {
- dout("do_splice_direct returned %d\n", err);
+ /* Abort on short copies or on error */
+ if (ret < src_objlen) {
+ dout("Failed partial copy (%zd)\n", ret);
goto out;
}
len -= ret;
@@ -2094,65 +2399,27 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (err < 0)
goto out_caps;
}
- object_size = src_ci->i_layout.object_size;
- while (len >= object_size) {
- ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
- object_size, &src_objnum,
- &src_objoff, &src_objlen);
- ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
- object_size, &dst_objnum,
- &dst_objoff, &dst_objlen);
- ceph_oid_init(&src_oid);
- ceph_oid_printf(&src_oid, "%llx.%08llx",
- src_ci->i_vino.ino, src_objnum);
- ceph_oid_init(&dst_oid);
- ceph_oid_printf(&dst_oid, "%llx.%08llx",
- dst_ci->i_vino.ino, dst_objnum);
- /* Do an object remote copy */
- err = ceph_osdc_copy_from(
- &src_fsc->client->osdc,
- src_ci->i_vino.snap, 0,
- &src_oid, &src_oloc,
- CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
- CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
- &dst_oid, &dst_oloc,
- CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
- CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
- dst_ci->i_truncate_seq, dst_ci->i_truncate_size,
- CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
- if (err) {
- if (err == -EOPNOTSUPP) {
- src_fsc->have_copy_from2 = false;
- pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
- }
- dout("ceph_osdc_copy_from returned %d\n", err);
- if (!ret)
- ret = err;
- goto out_caps;
- }
- len -= object_size;
- src_off += object_size;
- dst_off += object_size;
- ret += object_size;
- }
- if (len)
- /* We still need one final local copy */
- do_final_copy = true;
+ size = i_size_read(dst_inode);
+ bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off,
+ src_fsc, len, flags);
+ if (bytes <= 0) {
+ if (!ret)
+ ret = bytes;
+ goto out_caps;
+ }
+ dout("Copied %zu bytes out of %zu\n", bytes, len);
+ len -= bytes;
+ ret += bytes;
file_update_time(dst_file);
inode_inc_iversion_raw(dst_inode);
- if (endoff > size) {
- int caps_flags = 0;
-
+ if (dst_off > size) {
/* Let the MDS know about dst file size change */
- if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
- caps_flags |= CHECK_CAPS_NODELAY;
- if (ceph_inode_set_size(dst_inode, endoff))
- caps_flags |= CHECK_CAPS_AUTHONLY;
- if (caps_flags)
- ceph_check_caps(dst_ci, caps_flags, NULL);
+ if (ceph_inode_set_size(dst_inode, dst_off) ||
+ ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
+ ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL);
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
@@ -2165,15 +2432,18 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
out_caps:
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
- if (do_final_copy) {
- err = do_splice_direct(src_file, &src_off, dst_file,
- &dst_off, len, flags);
- if (err < 0) {
- dout("do_splice_direct returned %d\n", err);
- goto out;
- }
- len -= err;
- ret += err;
+ /*
+ * Do the final manual copy if we still have some bytes left, unless
+ * there were errors in remote object copies (len >= object_size).
+ */
+ if (len && (len < src_ci->i_layout.object_size)) {
+ dout("Final partial copy of %zu bytes\n", len);
+ bytes = do_splice_direct(src_file, &src_off, dst_file,
+ &dst_off, len, flags);
+ if (bytes > 0)
+ ret += bytes;
+ else
+ dout("Failed partial copy (%zd)\n", bytes);
}
out:
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d01710a16a4a..7fef94fd1e55 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -82,10 +82,14 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
+ inode->i_mtime = parent->i_mtime;
+ inode->i_ctime = parent->i_ctime;
+ inode->i_atime = parent->i_atime;
inode->i_op = &ceph_snapdir_iops;
inode->i_fop = &ceph_snapdir_fops;
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
ci->i_rbytes = 0;
+ ci->i_btime = ceph_inode(parent)->i_btime;
if (inode->i_state & I_NEW)
unlock_new_inode(inode);
@@ -447,6 +451,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_max_files = 0;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
ci->i_fragtree = RB_ROOT;
@@ -471,13 +476,13 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_prealloc_cap_flush = NULL;
INIT_LIST_HEAD(&ci->i_cap_flush_list);
init_waitqueue_head(&ci->i_cap_wq);
- ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
INIT_LIST_HEAD(&ci->i_cap_delay_list);
INIT_LIST_HEAD(&ci->i_cap_snaps);
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
+ ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
ci->i_nr_by_mode[i] = 0;
@@ -496,6 +501,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_rdcache_ref = 0;
ci->i_wr_ref = 0;
ci->i_wb_ref = 0;
+ ci->i_fx_ref = 0;
ci->i_wrbuffer_ref = 0;
ci->i_wrbuffer_ref_head = 0;
atomic_set(&ci->i_filelock_ref, 0);
@@ -586,6 +592,7 @@ void ceph_evict_inode(struct inode *inode)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
}
static inline blkcnt_t calc_inode_blocks(u64 size)
@@ -636,7 +643,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
if ((issued & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_BUFFER)) ||
mapping_mapped(inode->i_mapping) ||
- __ceph_caps_file_wanted(ci)) {
+ __ceph_is_file_opened(ci)) {
ci->i_truncate_pending++;
queue_trunc = 1;
}
@@ -727,11 +734,11 @@ void ceph_fill_file_time(struct inode *inode, int issued,
* Populate an inode based on info from mds. May be called on new or
* existing inodes.
*/
-static int fill_inode(struct inode *inode, struct page *locked_page,
- struct ceph_mds_reply_info_in *iinfo,
- struct ceph_mds_reply_dirfrag *dirinfo,
- struct ceph_mds_session *session, int cap_fmode,
- struct ceph_cap_reservation *caps_reservation)
+int ceph_fill_inode(struct inode *inode, struct page *locked_page,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_reply_inode *info = iinfo->in;
@@ -748,7 +755,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
bool new_version = false;
bool fill_inline = false;
- dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
+ dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
inode, ceph_vinop(inode), le64_to_cpu(info->version),
ci->i_version);
@@ -769,7 +776,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (iinfo->xattr_len > 4) {
xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
if (!xattr_blob)
- pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
+ pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
iinfo->xattr_len);
}
@@ -932,8 +939,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
spin_unlock(&ci->i_ceph_lock);
if (symlen != i_size_read(inode)) {
- pr_err("fill_inode %llx.%llx BAD symlink "
- "size %lld\n", ceph_vinop(inode),
+ pr_err("%s %llx.%llx BAD symlink "
+ "size %lld\n", __func__,
+ ceph_vinop(inode),
i_size_read(inode));
i_size_write(inode, symlen);
inode->i_blocks = calc_inode_blocks(symlen);
@@ -957,7 +965,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
inode->i_fop = &ceph_dir_fops;
break;
default:
- pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
+ pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
ceph_vinop(inode), inode->i_mode);
}
@@ -966,7 +974,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (ceph_snap(inode) == CEPH_NOSNAP) {
ceph_add_cap(inode, session,
le64_to_cpu(info->cap.cap_id),
- cap_fmode, info_caps,
+ info_caps,
le32_to_cpu(info->cap.wanted),
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
@@ -991,13 +999,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
dout(" %p got snap_caps %s\n", inode,
ceph_cap_string(info_caps));
ci->i_snap_caps |= info_caps;
- if (cap_fmode >= 0)
- __ceph_get_fmode(ci, cap_fmode);
}
- } else if (cap_fmode >= 0) {
- pr_warn("mds issued no caps on %llx.%llx\n",
- ceph_vinop(inode));
- __ceph_get_fmode(ci, cap_fmode);
}
if (iinfo->inline_version > 0 &&
@@ -1009,6 +1011,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
fill_inline = true;
}
+ if (cap_fmode >= 0) {
+ if (!info_caps)
+ pr_warn("mds issued no caps on %llx.%llx\n",
+ ceph_vinop(inode));
+ __ceph_touch_fmode(ci, mdsc, cap_fmode);
+ }
+
spin_unlock(&ci->i_ceph_lock);
if (fill_inline)
@@ -1050,6 +1059,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
struct ceph_mds_session **old_lease_session)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
+ unsigned mask = le16_to_cpu(lease->mask);
long unsigned duration = le32_to_cpu(lease->duration_ms);
long unsigned ttl = from_time + (duration * HZ) / 1000;
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
@@ -1061,8 +1071,13 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
if (ceph_snap(dir) != CEPH_NOSNAP)
return;
+ if (mask & CEPH_LEASE_PRIMARY_LINK)
+ di->flags |= CEPH_DENTRY_PRIMARY_LINK;
+ else
+ di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
+
di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
- if (duration == 0) {
+ if (!(mask & CEPH_LEASE_VALID)) {
__ceph_dentry_dir_lease_touch(di);
return;
}
@@ -1239,10 +1254,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct inode *dir = req->r_parent;
if (dir) {
- err = fill_inode(dir, NULL,
- &rinfo->diri, rinfo->dirfrag,
- session, -1,
- &req->r_caps_reservation);
+ err = ceph_fill_inode(dir, NULL, &rinfo->diri,
+ rinfo->dirfrag, session, -1,
+ &req->r_caps_reservation);
if (err < 0)
goto done;
} else {
@@ -1307,13 +1321,14 @@ retry_lookup:
goto done;
}
- err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
- session,
+ err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
+ NULL, session,
(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+ !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
rinfo->head->result == 0) ? req->r_fmode : -1,
&req->r_caps_reservation);
if (err < 0) {
- pr_err("fill_inode badness %p %llx.%llx\n",
+ pr_err("ceph_fill_inode badness %p %llx.%llx\n",
in, ceph_vinop(in));
if (in->i_state & I_NEW)
discard_new_inode(in);
@@ -1500,10 +1515,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
dout("new_inode badness got %d\n", err);
continue;
}
- rc = fill_inode(in, NULL, &rde->inode, NULL, session,
- -1, &req->r_caps_reservation);
+ rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+ -1, &req->r_caps_reservation);
if (rc < 0) {
- pr_err("fill_inode badness on %p got %d\n", in, rc);
+ pr_err("ceph_fill_inode badness on %p got %d\n",
+ in, rc);
err = rc;
if (in->i_state & I_NEW) {
ihold(in);
@@ -1707,10 +1723,10 @@ retry_lookup:
}
}
- ret = fill_inode(in, NULL, &rde->inode, NULL, session,
- -1, &req->r_caps_reservation);
+ ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+ -1, &req->r_caps_reservation);
if (ret < 0) {
- pr_err("fill_inode badness on %p\n", in);
+ pr_err("ceph_fill_inode badness on %p\n", in);
if (d_really_is_negative(dn)) {
/* avoid calling iput_final() in mds
* dispatch threads */
@@ -1972,7 +1988,7 @@ retry:
mutex_unlock(&ci->i_truncate_mutex);
if (wrbuffer_refs == 0)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, 0, NULL);
wake_up_all(&ci->i_cap_wq);
}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index c90f03beb15d..6e061bf62ad4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
spin_lock(&ci->i_ceph_lock);
fi->fmode |= CEPH_FILE_MODE_LAZY;
ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
+ __ceph_touch_fmode(ci, mdsc, fi->fmode);
spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 544e9e85b120..d6b9166e71e4 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -210,6 +210,21 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
return 0;
}
+static int try_unlock_file(struct file *file, struct file_lock *fl)
+{
+ int err;
+ unsigned int orig_flags = fl->fl_flags;
+ fl->fl_flags |= FL_EXISTS;
+ err = locks_lock_file_wait(file, fl);
+ fl->fl_flags = orig_flags;
+ if (err == -ENOENT) {
+ if (!(orig_flags & FL_EXISTS))
+ err = 0;
+ return err;
+ }
+ return 1;
+}
+
/**
* Attempt to set an fcntl lock.
* For now, this just goes away to the server. Later it may be more awesome.
@@ -255,9 +270,15 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
else
lock_cmd = CEPH_LOCK_UNLOCK;
+ if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) {
+ err = try_unlock_file(file, fl);
+ if (err <= 0)
+ return err;
+ }
+
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
if (!err) {
- if (op == CEPH_MDS_OP_SETFILELOCK) {
+ if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
dout("mds locked, locking locally\n");
err = posix_lock_file(file, fl, NULL);
if (err) {
@@ -311,9 +332,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
else
lock_cmd = CEPH_LOCK_UNLOCK;
+ if (F_UNLCK == fl->fl_type) {
+ err = try_unlock_file(file, fl);
+ if (err <= 0)
+ return err;
+ }
+
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
inode, lock_cmd, wait, fl);
- if (!err) {
+ if (!err && F_UNLCK != fl->fl_type) {
err = locks_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index bbbbddf71326..486f91f9685b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -415,21 +415,121 @@ bad:
return -EIO;
}
+
+#if BITS_PER_LONG == 64
+
+#define DELEGATED_INO_AVAILABLE xa_mk_value(1)
+
+static int ceph_parse_deleg_inos(void **p, void *end,
+ struct ceph_mds_session *s)
+{
+ u32 sets;
+
+ ceph_decode_32_safe(p, end, sets, bad);
+ dout("got %u sets of delegated inodes\n", sets);
+ while (sets--) {
+ u64 start, len, ino;
+
+ ceph_decode_64_safe(p, end, start, bad);
+ ceph_decode_64_safe(p, end, len, bad);
+ while (len--) {
+ int err = xa_insert(&s->s_delegated_inos, ino = start++,
+ DELEGATED_INO_AVAILABLE,
+ GFP_KERNEL);
+ if (!err) {
+ dout("added delegated inode 0x%llx\n",
+ start - 1);
+ } else if (err == -EBUSY) {
+ pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
+ start - 1);
+ } else {
+ return err;
+ }
+ }
+ }
+ return 0;
+bad:
+ return -EIO;
+}
+
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+ unsigned long ino;
+ void *val;
+
+ xa_for_each(&s->s_delegated_inos, ino, val) {
+ val = xa_erase(&s->s_delegated_inos, ino);
+ if (val == DELEGATED_INO_AVAILABLE)
+ return ino;
+ }
+ return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+ return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
+ GFP_KERNEL);
+}
+#else /* BITS_PER_LONG == 64 */
+/*
+ * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
+ * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
+ * and bottom words?
+ */
+static int ceph_parse_deleg_inos(void **p, void *end,
+ struct ceph_mds_session *s)
+{
+ u32 sets;
+
+ ceph_decode_32_safe(p, end, sets, bad);
+ if (sets)
+ ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
+ return 0;
+bad:
+ return -EIO;
+}
+
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+ return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+ return 0;
+}
+#endif /* BITS_PER_LONG == 64 */
+
/*
* parse create results
*/
static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- u64 features)
+ u64 features, struct ceph_mds_session *s)
{
+ int ret;
+
if (features == (u64)-1 ||
(features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
- /* Malformed reply? */
if (*p == end) {
+ /* Malformed reply? */
info->has_create_ino = false;
- } else {
+ } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
+ u8 struct_v, struct_compat;
+ u32 len;
+
info->has_create_ino = true;
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_64_safe(p, end, info->ino, bad);
+ ret = ceph_parse_deleg_inos(p, end, s);
+ if (ret)
+ return ret;
+ } else {
+ /* legacy */
ceph_decode_64_safe(p, end, info->ino, bad);
+ info->has_create_ino = true;
}
} else {
if (*p != end)
@@ -448,7 +548,7 @@ bad:
*/
static int parse_reply_info_extra(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- u64 features)
+ u64 features, struct ceph_mds_session *s)
{
u32 op = le32_to_cpu(info->head->op);
@@ -457,7 +557,7 @@ static int parse_reply_info_extra(void **p, void *end,
else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
- return parse_reply_info_create(p, end, info, features);
+ return parse_reply_info_create(p, end, info, features, s);
else
return -EIO;
}
@@ -465,7 +565,7 @@ static int parse_reply_info_extra(void **p, void *end,
/*
* parse entire mds reply
*/
-static int parse_reply_info(struct ceph_msg *msg,
+static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
@@ -490,7 +590,7 @@ static int parse_reply_info(struct ceph_msg *msg,
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
ceph_decode_need(&p, end, len, bad);
- err = parse_reply_info_extra(&p, p+len, info, features);
+ err = parse_reply_info_extra(&p, p+len, info, features, s);
if (err < 0)
goto out_bad;
}
@@ -558,6 +658,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
if (refcount_dec_and_test(&s->s_ref)) {
if (s->s_auth.authorizer)
ceph_auth_destroy_authorizer(s->s_auth.authorizer);
+ xa_destroy(&s->s_delegated_inos);
kfree(s);
}
}
@@ -645,6 +746,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
refcount_set(&s->s_ref, 1);
INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe);
+ xa_init(&s->s_delegated_inos);
s->s_num_cap_releases = 0;
s->s_cap_reconnect = 0;
s->s_cap_iterator = NULL;
@@ -699,6 +801,7 @@ void ceph_mdsc_release_request(struct kref *kref)
struct ceph_mds_request *req = container_of(kref,
struct ceph_mds_request,
r_kref);
+ ceph_mdsc_release_dir_caps(req);
destroy_reply_info(&req->r_reply_info);
if (req->r_request)
ceph_msg_put(req->r_request);
@@ -736,7 +839,7 @@ void ceph_mdsc_release_request(struct kref *kref)
put_request_session(req);
ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
WARN_ON_ONCE(!list_empty(&req->r_wait));
- kfree(req);
+ kmem_cache_free(ceph_mds_request_cachep, req);
}
DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
@@ -793,8 +896,13 @@ static void __register_request(struct ceph_mds_client *mdsc,
mdsc->oldest_tid = req->r_tid;
if (dir) {
+ struct ceph_inode_info *ci = ceph_inode(dir);
+
ihold(dir);
req->r_unsafe_dir = dir;
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
+ spin_unlock(&ci->i_unsafe_lock);
}
}
@@ -822,8 +930,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
erase_request(&mdsc->request_tree, req);
- if (req->r_unsafe_dir &&
- test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ if (req->r_unsafe_dir) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
@@ -1407,8 +1514,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
spin_lock(&ci->i_ceph_lock);
- if (cap->mds_wanted | cap->issued)
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
__ceph_remove_cap(cap, false);
if (!ci->i_auth_cap) {
struct ceph_cap_flush *cf;
@@ -1574,9 +1679,6 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
/* mds did not re-issue stale cap */
spin_lock(&ci->i_ceph_lock);
cap->issued = cap->implemented = CEPH_CAP_PIN;
- /* make sure mds knows what we want */
- if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
spin_unlock(&ci->i_ceph_lock);
}
} else if (ev == FORCE_RO) {
@@ -1772,7 +1874,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
}
/* The inode has cached pages, but it's no longer used.
* we can safely drop it */
- if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
+ if (S_ISREG(inode->i_mode) &&
+ wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
!(oissued & CEPH_CAP_FILE_CACHE)) {
used = 0;
oissued = 0;
@@ -2089,8 +2192,9 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
{
- struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+ struct ceph_mds_request *req;
+ req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS);
if (!req)
return ERR_PTR(-ENOMEM);
@@ -2368,7 +2472,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
head->op = cpu_to_le32(req->r_op);
head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
- head->ino = 0;
+ head->ino = cpu_to_le64(req->r_deleg_ino);
head->args = req->r_args;
ceph_encode_filepath(&p, end, ino1, path1);
@@ -2382,7 +2486,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_inode_drop)
releases += ceph_encode_inode_release(&p,
req->r_inode ? req->r_inode : d_inode(req->r_dentry),
- mds, req->r_inode_drop, req->r_inode_unless, 0);
+ mds, req->r_inode_drop, req->r_inode_unless,
+ req->r_op == CEPH_MDS_OP_READDIR);
if (req->r_dentry_drop)
releases += ceph_encode_dentry_release(&p, req->r_dentry,
req->r_parent, mds, req->r_dentry_drop,
@@ -2522,12 +2627,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
flags |= CEPH_MDS_FLAG_REPLAY;
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
+ flags |= CEPH_MDS_FLAG_ASYNC;
if (req->r_parent)
flags |= CEPH_MDS_FLAG_WANT_DENTRY;
rhead->flags = cpu_to_le32(flags);
rhead->num_fwd = req->r_num_fwd;
rhead->num_retry = req->r_attempts - 1;
- rhead->ino = 0;
dout(" r_parent = %p\n", req->r_parent);
return 0;
@@ -2573,7 +2679,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (req->r_timeout &&
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
dout("do_request timed out\n");
- err = -EIO;
+ err = -ETIMEDOUT;
goto finish;
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
@@ -2605,6 +2711,10 @@ static void __do_request(struct ceph_mds_client *mdsc,
mds = __choose_mds(mdsc, req, &random);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
+ err = -EJUKEBOX;
+ goto finish;
+ }
dout("do_request no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
return;
@@ -2629,6 +2739,15 @@ static void __do_request(struct ceph_mds_client *mdsc,
err = -EACCES;
goto out_session;
}
+ /*
+ * We cannot queue async requests since the caps and delegated
+ * inodes are bound to the session. Just return -EJUKEBOX and
+ * let the caller retry a sync request in that case.
+ */
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
+ err = -EJUKEBOX;
+ goto out_session;
+ }
if (session->s_state == CEPH_MDS_SESSION_NEW ||
session->s_state == CEPH_MDS_SESSION_CLOSING) {
__open_session(mdsc, session);
@@ -2709,19 +2828,43 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
struct ceph_mds_request *req)
{
- int err;
+ int err = 0;
/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
if (req->r_inode)
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
if (req->r_parent) {
- ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ struct ceph_inode_info *ci = ceph_inode(req->r_parent);
+ int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ?
+ CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD;
+ spin_lock(&ci->i_ceph_lock);
+ ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);
+ __ceph_touch_fmode(ci, mdsc, fmode);
+ spin_unlock(&ci->i_ceph_lock);
ihold(req->r_parent);
}
if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
+ if (req->r_inode) {
+ err = ceph_wait_on_async_create(req->r_inode);
+ if (err) {
+ dout("%s: wait for async create returned: %d\n",
+ __func__, err);
+ return err;
+ }
+ }
+
+ if (!err && req->r_old_inode) {
+ err = ceph_wait_on_async_create(req->r_old_inode);
+ if (err) {
+ dout("%s: wait for async create returned: %d\n",
+ __func__, err);
+ return err;
+ }
+ }
+
dout("submit_request on %p for inode %p\n", req, dir);
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
@@ -2747,7 +2890,7 @@ static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
if (timeleft > 0)
err = 0;
else if (!timeleft)
- err = -EIO; /* timed out */
+ err = -ETIMEDOUT; /* timed out */
else
err = timeleft; /* killed */
}
@@ -2935,22 +3078,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} else {
set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
- if (req->r_unsafe_dir) {
- struct ceph_inode_info *ci =
- ceph_inode(req->r_unsafe_dir);
- spin_lock(&ci->i_unsafe_lock);
- list_add_tail(&req->r_unsafe_dir_item,
- &ci->i_unsafe_dirops);
- spin_unlock(&ci->i_unsafe_lock);
- }
}
dout("handle_reply tid %lld result %d\n", tid, result);
rinfo = &req->r_reply_info;
if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
- err = parse_reply_info(msg, rinfo, (u64)-1);
+ err = parse_reply_info(session, msg, rinfo, (u64)-1);
else
- err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+ err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
@@ -3249,6 +3384,17 @@ bad:
return;
}
+void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
+{
+ int dcaps;
+
+ dcaps = xchg(&req->r_dir_caps, 0);
+ if (dcaps) {
+ dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+ ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
+ }
+}
+
/*
* called under session->mutex.
*/
@@ -3276,9 +3422,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
continue;
if (req->r_attempts == 0)
continue; /* only old requests */
- if (req->r_session &&
- req->r_session->s_mds == session->s_mds)
- __send_request(mdsc, session, req, true);
+ if (!req->r_session)
+ continue;
+ if (req->r_session->s_mds != session->s_mds)
+ continue;
+
+ ceph_mdsc_release_dir_caps(req);
+
+ __send_request(mdsc, session, req, true);
}
mutex_unlock(&mdsc->mutex);
}
@@ -3362,7 +3513,7 @@ fail_msg:
/*
* Encode information about a cap for a reconnect with the MDS.
*/
-static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
+static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
union {
@@ -3385,6 +3536,15 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
cap->mseq = 0; /* and migrate_seq */
cap->cap_gen = cap->session->s_cap_gen;
+ /* These are lost when the session goes away */
+ if (S_ISDIR(inode->i_mode)) {
+ if (cap->issued & CEPH_CAP_DIR_CREATE) {
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
+ }
+ cap->issued &= ~CEPH_CAP_ANY_DIR_OPS;
+ }
+
if (recon_state->msg_version >= 2) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
@@ -3626,6 +3786,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
if (!reply)
goto fail_nomsg;
+ xa_destroy(&session->s_delegated_inos);
+
mutex_lock(&session->s_mutex);
session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0;
@@ -3681,7 +3843,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
recon_state.msg_version = 2;
}
/* trsaverse this session's caps */
- err = ceph_iterate_session_caps(session, encode_caps_cb, &recon_state);
+ err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
spin_lock(&session->s_cap_lock);
session->s_cap_reconnect = 0;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 27a7446e10d3..4e5be79bf080 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -23,8 +23,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_RECLAIM_CLIENT,
CEPHFS_FEATURE_LAZY_CAP_WANTED,
CEPHFS_FEATURE_MULTI_RECONNECT,
+ CEPHFS_FEATURE_DELEG_INO,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
};
/*
@@ -37,6 +38,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_REPLY_ENCODING, \
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
CEPHFS_FEATURE_MULTI_RECONNECT, \
+ CEPHFS_FEATURE_DELEG_INO, \
\
CEPHFS_FEATURE_MAX, \
}
@@ -201,6 +203,7 @@ struct ceph_mds_session {
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
+ struct xarray s_delegated_inos;
};
/*
@@ -255,6 +258,7 @@ struct ceph_mds_request {
#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */
#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */
#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */
+#define CEPH_MDS_R_ASYNC (8) /* async request */
unsigned long r_req_flags;
struct mutex r_fill_mutex;
@@ -263,6 +267,7 @@ struct ceph_mds_request {
int r_fmode; /* file mode, if expecting cap */
kuid_t r_uid;
kgid_t r_gid;
+ int r_request_release_offset;
struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
@@ -280,12 +285,16 @@ struct ceph_mds_request {
int r_old_inode_drop, r_old_inode_unless;
struct ceph_msg *r_request; /* original request */
- int r_request_release_offset;
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
- struct page *r_locked_page;
int r_err;
+
+ struct page *r_locked_page;
+ int r_dir_caps;
+ int r_num_caps;
+ u32 r_readdir_offset;
+
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
unsigned long r_request_started; /* start time for mds request only,
@@ -304,6 +313,7 @@ struct ceph_mds_request {
int r_num_fwd; /* number of forward attempts */
int r_resend_mds; /* mds to resend to next, if any*/
u32 r_sent_on_mseq; /* cap mseq request was sent at*/
+ u64 r_deleg_ino;
struct list_head r_wait;
struct completion r_completion;
@@ -315,10 +325,8 @@ struct ceph_mds_request {
long long r_dir_release_cnt;
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
- u32 r_readdir_offset;
struct ceph_cap_reservation r_caps_reservation;
- int r_num_caps;
};
struct ceph_pool_perm {
@@ -488,6 +496,7 @@ extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
+extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req);
static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
{
kref_get(&req->r_kref);
@@ -537,4 +546,15 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
int max_caps);
+
+static inline int ceph_wait_on_async_create(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
+ TASK_INTERRUPTIBLE);
+}
+
+extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
+extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
#endif
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c7f150686a53..c9784eb1159a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -155,6 +155,7 @@ enum {
Opt_acl,
Opt_quotadf,
Opt_copyfrom,
+ Opt_wsync,
};
enum ceph_recover_session_mode {
@@ -194,6 +195,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
fsparam_string ("snapdirname", Opt_snapdirname),
fsparam_string ("source", Opt_source),
fsparam_u32 ("wsize", Opt_wsize),
+ fsparam_flag_no ("wsync", Opt_wsync),
{}
};
@@ -444,6 +446,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
fc->sb_flags &= ~SB_POSIXACL;
}
break;
+ case Opt_wsync:
+ if (!result.negated)
+ fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS;
+ else
+ fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
+ break;
default:
BUG();
}
@@ -567,6 +575,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
seq_show_option(m, "recover_session", "clean");
+ if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+ seq_puts(m, ",nowsync");
+
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
seq_printf(m, ",wsize=%u", fsopt->wsize);
if (fsopt->rsize != CEPH_MAX_READ_SIZE)
@@ -729,6 +740,7 @@ struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
struct kmem_cache *ceph_dir_file_cachep;
+struct kmem_cache *ceph_mds_request_cachep;
static void ceph_inode_init_once(void *foo)
{
@@ -769,6 +781,10 @@ static int __init init_caches(void)
if (!ceph_dir_file_cachep)
goto bad_dir_file;
+ ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, SLAB_MEM_SPREAD);
+ if (!ceph_mds_request_cachep)
+ goto bad_mds_req;
+
error = ceph_fscache_register();
if (error)
goto bad_fscache;
@@ -776,6 +792,8 @@ static int __init init_caches(void)
return 0;
bad_fscache:
+ kmem_cache_destroy(ceph_mds_request_cachep);
+bad_mds_req:
kmem_cache_destroy(ceph_dir_file_cachep);
bad_dir_file:
kmem_cache_destroy(ceph_file_cachep);
@@ -804,6 +822,7 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
kmem_cache_destroy(ceph_dir_file_cachep);
+ kmem_cache_destroy(ceph_mds_request_cachep);
ceph_fscache_unregister();
}
@@ -1107,6 +1126,15 @@ static void ceph_free_fc(struct fs_context *fc)
static int ceph_reconfigure_fc(struct fs_context *fc)
{
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb);
+
+ if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+ ceph_set_mount_opt(fsc, ASYNC_DIROPS);
+ else
+ ceph_clear_mount_opt(fsc, ASYNC_DIROPS);
+
sync_filesystem(fc->root->d_sb);
return 0;
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 037cdfb2ad4f..60aac3aee055 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -43,13 +43,16 @@
#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
+#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */
#define CEPH_MOUNT_OPT_DEFAULT \
(CEPH_MOUNT_OPT_DCACHE | \
CEPH_MOUNT_OPT_NOCOPYFROM)
#define ceph_set_mount_opt(fsc, opt) \
- (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt
+#define ceph_clear_mount_opt(fsc, opt) \
+ (fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt
#define ceph_test_mount_opt(fsc, opt) \
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
@@ -170,9 +173,9 @@ struct ceph_cap {
struct list_head caps_item;
};
-#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
-#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
-#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
+#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */
+#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */
+#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */
struct ceph_cap_flush {
u64 tid;
@@ -284,6 +287,7 @@ struct ceph_dentry_info {
#define CEPH_DENTRY_REFERENCED 1
#define CEPH_DENTRY_LEASE_LIST 2
#define CEPH_DENTRY_SHRINK_LIST 4
+#define CEPH_DENTRY_PRIMARY_LINK 8
struct ceph_inode_xattrs_info {
/*
@@ -315,13 +319,14 @@ struct ceph_inode_info {
u64 i_inline_version;
u32 i_time_warp_seq;
- unsigned i_ceph_flags;
+ unsigned long i_ceph_flags;
atomic64_t i_release_count;
atomic64_t i_ordered_count;
atomic64_t i_complete_seq[2];
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
+ struct ceph_file_layout i_cached_layout; // for async creates
char *i_symlink;
/* for dirs */
@@ -352,7 +357,6 @@ struct ceph_inode_info {
struct ceph_cap_flush *i_prealloc_cap_flush;
struct list_head i_cap_flush_list;
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
- unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
struct list_head i_cap_delay_list; /* for delayed cap release to mds */
struct ceph_cap_reservation i_cap_migration_resv;
@@ -361,6 +365,8 @@ struct ceph_inode_info {
dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
+ unsigned long i_last_rd;
+ unsigned long i_last_wr;
int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */
struct mutex i_truncate_mutex;
@@ -375,7 +381,7 @@ struct ceph_inode_info {
/* held references to caps */
int i_pin_ref;
- int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
+ int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref;
int i_wrbuffer_ref, i_wrbuffer_ref_head;
atomic_t i_filelock_ref;
atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */
@@ -511,18 +517,18 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
* Ceph inode.
*/
#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */
-#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */
#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */
#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */
#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */
#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */
#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */
-#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */
-#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */
-#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */
-#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */
-#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */
-#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */
+#define CEPH_I_KICK_FLUSH (1 << 7) /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snapss */
+#define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */
+#define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */
+#define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */
+#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */
+#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT)
/*
* Masks of ceph inode work.
@@ -674,18 +680,12 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
extern int __ceph_caps_used(struct ceph_inode_info *ci);
-extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
-
-/*
- * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
- */
-static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
+static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
{
- int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
- if (w & CEPH_CAP_FILE_BUFFER)
- w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
- return w;
+ return ci->i_nr_by_mode[0];
}
+extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
+extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
/* what the mds thinks we want */
extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
@@ -899,6 +899,9 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
}
/* inode.c */
+struct ceph_mds_reply_info_in;
+struct ceph_mds_reply_dirfrag;
+
extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
@@ -914,6 +917,11 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
u64 time_warp_seq, struct timespec64 *ctime,
struct timespec64 *mtime,
struct timespec64 *atime);
+extern int ceph_fill_inode(struct inode *inode, struct page *locked_page,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation);
extern int ceph_fill_trace(struct super_block *sb,
struct ceph_mds_request *req);
extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
@@ -1042,7 +1050,7 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
extern void ceph_add_cap(struct inode *inode,
struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
+ unsigned issued, unsigned wanted,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
@@ -1058,8 +1066,12 @@ extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
+ struct ceph_inode_info *ci);
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
int mds);
+extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
+ bool snap_rwsem_locked);
extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
@@ -1084,8 +1096,10 @@ extern int ceph_try_get_caps(struct inode *inode,
int need, int want, bool nonblock, int *got);
/* for counting open files by mode */
-extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
-extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
+ struct ceph_mds_client *mdsc, int fmode);
/* addr.c */
extern const struct address_space_operations ceph_aops;
@@ -1097,7 +1111,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
/* file.c */
extern const struct file_operations ceph_file_fops;
-extern int ceph_renew_caps(struct inode *inode);
+extern int ceph_renew_caps(struct inode *inode, int fmode);
extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 22cf04fb32d3..604f65f4b6c5 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -202,7 +202,7 @@ config CIFS_SMB_DIRECT
help
Enables SMB Direct support for SMB 3.0, 3.02 and 3.1.1.
SMB Direct allows transferring SMB packets over RDMA. If unsure,
- say N.
+ say Y.
config CIFS_FSCACHE
bool "Provide CIFS client caching support"
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 276e4b5ea8e0..916567d770f5 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -323,10 +323,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
atomic_read(&server->smbd_conn->send_credits),
atomic_read(&server->smbd_conn->receive_credits),
server->smbd_conn->receive_credit_target);
- seq_printf(m, "\nPending send_pending: %x "
- "send_payload_pending: %x",
- atomic_read(&server->smbd_conn->send_pending),
- atomic_read(&server->smbd_conn->send_payload_pending));
+ seq_printf(m, "\nPending send_pending: %x ",
+ atomic_read(&server->smbd_conn->send_pending));
seq_printf(m, "\nReceive buffers count_receive_queue: %x "
"count_empty_packet_queue: %x",
server->smbd_conn->count_receive_queue,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 94e3ed4850b5..c31f362fa098 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1208,6 +1208,10 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
{
unsigned int xid = get_xid();
ssize_t rc;
+ struct cifsFileInfo *cfile = dst_file->private_data;
+
+ if (cfile->swapfile)
+ return -EOPNOTSUPP;
rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff,
len, flags);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0d956360e984..05dd3dea684b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -426,7 +426,8 @@ struct smb_version_operations {
/* generate new lease key */
void (*new_lease_key)(struct cifs_fid *);
int (*generate_signingkey)(struct cifs_ses *);
- int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
+ int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *,
+ bool allocate_crypto);
int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon,
struct cifsFileInfo *src_file);
int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon,
@@ -1312,6 +1313,7 @@ struct cifsFileInfo {
struct tcon_link *tlink;
unsigned int f_flags;
bool invalidHandle:1; /* file closed via session abend */
+ bool swapfile:1;
bool oplock_break_cancelled:1;
unsigned int oplock_epoch; /* epoch from the lease break */
__u32 oplock_level; /* oplock/lease level from the lease break */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5920820bfbd0..0b1528edebcf 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4808,6 +4808,60 @@ cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter)
return -EINVAL;
}
+static int cifs_swap_activate(struct swap_info_struct *sis,
+ struct file *swap_file, sector_t *span)
+{
+ struct cifsFileInfo *cfile = swap_file->private_data;
+ struct inode *inode = swap_file->f_mapping->host;
+ unsigned long blocks;
+ long long isize;
+
+ cifs_dbg(FYI, "swap activate\n");
+
+ spin_lock(&inode->i_lock);
+ blocks = inode->i_blocks;
+ isize = inode->i_size;
+ spin_unlock(&inode->i_lock);
+ if (blocks*512 < isize) {
+ pr_warn("swap activate: swapfile has holes\n");
+ return -EINVAL;
+ }
+ *span = sis->pages;
+
+ printk_once(KERN_WARNING "Swap support over SMB3 is experimental\n");
+
+ /*
+ * TODO: consider adding ACL (or documenting how) to prevent other
+ * users (on this or other systems) from reading it
+ */
+
+
+ /* TODO: add sk_set_memalloc(inet) or similar */
+
+ if (cfile)
+ cfile->swapfile = true;
+ /*
+ * TODO: Since file already open, we can't open with DENY_ALL here
+ * but we could add call to grab a byte range lock to prevent others
+ * from reading or writing the file
+ */
+
+ return 0;
+}
+
+static void cifs_swap_deactivate(struct file *file)
+{
+ struct cifsFileInfo *cfile = file->private_data;
+
+ cifs_dbg(FYI, "swap deactivate\n");
+
+ /* TODO: undo sk_set_memalloc(inet) will eventually be needed */
+
+ if (cfile)
+ cfile->swapfile = false;
+
+ /* do we need to unpin (or unlock) the file */
+}
const struct address_space_operations cifs_addr_ops = {
.readpage = cifs_readpage,
@@ -4821,6 +4875,13 @@ const struct address_space_operations cifs_addr_ops = {
.direct_IO = cifs_direct_io,
.invalidatepage = cifs_invalidate_page,
.launder_page = cifs_launder_page,
+ /*
+ * TODO: investigate and if useful we could add an cifs_migratePage
+ * helper (under an CONFIG_MIGRATION) in the future, and also
+ * investigate and add an is_dirty_writeback helper if needed
+ */
+ .swap_activate = cifs_swap_activate,
+ .swap_deactivate = cifs_swap_deactivate,
};
/*
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8d01ec2dca66..8fbbdcdad8ff 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2026,6 +2026,10 @@ cifs_revalidate_mapping(struct inode *inode)
int rc;
unsigned long *flags = &CIFS_I(inode)->flags;
+ /* swapfiles are not supposed to be shared */
+ if (IS_SWAPFILE(inode))
+ return 0;
+
rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
TASK_KILLABLE);
if (rc)
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 19e4a5d3b4ca..50f776a8d4ba 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -246,7 +246,7 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info,
*/
fattr->cf_mode = le32_to_cpu(info->Mode) & ~S_IFMT;
- cifs_dbg(VFS, "XXX dev %d, reparse %d, mode %o",
+ cifs_dbg(FYI, "posix fattr: dev %d, reparse %d, mode %o",
le32_to_cpu(info->DeviceId),
le32_to_cpu(info->ReparseTag),
le32_to_cpu(info->Mode));
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 0511aaf451d4..497afb0b9960 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -766,6 +766,20 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
spin_lock(&cifs_tcp_ses_lock);
+ if (tcon->tc_count <= 0) {
+ struct TCP_Server_Info *server = NULL;
+
+ WARN_ONCE(tcon->tc_count < 0, "tcon refcount is negative");
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ if (tcon->ses)
+ server = tcon->ses->server;
+
+ cifs_server_dbg(FYI, "tid=%u: tcon is closing, skipping async close retry of fid %llu %llu\n",
+ tcon->tid, persistent_fid, volatile_fid);
+
+ return 0;
+ }
tcon->tc_count++;
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 4d1ff7b66fdc..087d5f14320b 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -55,9 +55,11 @@ extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server,
extern struct cifs_tcon *smb2_find_smb_tcon(struct TCP_Server_Info *server,
__u64 ses_id, __u32 tid);
extern int smb2_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server);
+ struct TCP_Server_Info *server,
+ bool allocate_crypto);
extern int smb3_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server);
+ struct TCP_Server_Info *server,
+ bool allocate_crypto);
extern void smb2_echo_request(struct work_struct *work);
extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
extern bool smb2_is_valid_oplock_break(char *buffer,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 20cc79e5c15d..1a6c227ada8f 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -41,14 +41,6 @@
#include "smb2glob.h"
static int
-smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
-{
- return cifs_alloc_hash("hmac(sha256)",
- &server->secmech.hmacsha256,
- &server->secmech.sdeschmacsha256);
-}
-
-static int
smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
{
struct cifs_secmech *p = &server->secmech;
@@ -219,7 +211,8 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid)
}
int
-smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
+ bool allocate_crypto)
{
int rc;
unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
@@ -228,6 +221,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
struct cifs_ses *ses;
struct shash_desc *shash;
+ struct crypto_shash *hash;
+ struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
ses = smb2_find_smb_ses(server, shdr->SessionId);
@@ -239,24 +234,32 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
- rc = smb2_crypto_shash_allocate(server);
- if (rc) {
- cifs_server_dbg(VFS, "%s: sha256 alloc failed\n", __func__);
- return rc;
+ if (allocate_crypto) {
+ rc = cifs_alloc_hash("hmac(sha256)", &hash, &sdesc);
+ if (rc) {
+ cifs_server_dbg(VFS,
+ "%s: sha256 alloc failed\n", __func__);
+ return rc;
+ }
+ shash = &sdesc->shash;
+ } else {
+ hash = server->secmech.hmacsha256;
+ shash = &server->secmech.sdeschmacsha256->shash;
}
- rc = crypto_shash_setkey(server->secmech.hmacsha256,
- ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
+ rc = crypto_shash_setkey(hash, ses->auth_key.response,
+ SMB2_NTLMV2_SESSKEY_SIZE);
if (rc) {
- cifs_server_dbg(VFS, "%s: Could not update with response\n", __func__);
- return rc;
+ cifs_server_dbg(VFS,
+ "%s: Could not update with response\n",
+ __func__);
+ goto out;
}
- shash = &server->secmech.sdeschmacsha256->shash;
rc = crypto_shash_init(shash);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not init sha256", __func__);
- return rc;
+ goto out;
}
/*
@@ -271,9 +274,10 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
rc = crypto_shash_update(shash, iov[0].iov_base,
iov[0].iov_len);
if (rc) {
- cifs_server_dbg(VFS, "%s: Could not update with payload\n",
- __func__);
- return rc;
+ cifs_server_dbg(VFS,
+ "%s: Could not update with payload\n",
+ __func__);
+ goto out;
}
drqst.rq_iov++;
drqst.rq_nvec--;
@@ -283,6 +287,9 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (!rc)
memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+out:
+ if (allocate_crypto)
+ cifs_free_hash(&hash, &sdesc);
return rc;
}
@@ -504,14 +511,17 @@ generate_smb311signingkey(struct cifs_ses *ses)
}
int
-smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
+ bool allocate_crypto)
{
int rc;
unsigned char smb3_signature[SMB2_CMACAES_SIZE];
unsigned char *sigptr = smb3_signature;
struct kvec *iov = rqst->rq_iov;
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
- struct shash_desc *shash = &server->secmech.sdesccmacaes->shash;
+ struct shash_desc *shash;
+ struct crypto_shash *hash;
+ struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
u8 key[SMB3_SIGN_KEY_SIZE];
@@ -519,14 +529,24 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (rc)
return 0;
+ if (allocate_crypto) {
+ rc = cifs_alloc_hash("cmac(aes)", &hash, &sdesc);
+ if (rc)
+ return rc;
+
+ shash = &sdesc->shash;
+ } else {
+ hash = server->secmech.cmacaes;
+ shash = &server->secmech.sdesccmacaes->shash;
+ }
+
memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
- rc = crypto_shash_setkey(server->secmech.cmacaes,
- key, SMB2_CMACAES_SIZE);
+ rc = crypto_shash_setkey(hash, key, SMB2_CMACAES_SIZE);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
- return rc;
+ goto out;
}
/*
@@ -537,7 +557,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
rc = crypto_shash_init(shash);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
- return rc;
+ goto out;
}
/*
@@ -554,7 +574,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with payload\n",
__func__);
- return rc;
+ goto out;
}
drqst.rq_iov++;
drqst.rq_nvec--;
@@ -564,6 +584,9 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (!rc)
memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+out:
+ if (allocate_crypto)
+ cifs_free_hash(&hash, &sdesc);
return rc;
}
@@ -593,7 +616,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
return 0;
}
- rc = server->ops->calc_signature(rqst, server);
+ rc = server->ops->calc_signature(rqst, server, false);
return rc;
}
@@ -631,9 +654,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
memset(shdr->Signature, 0, SMB2_SIGNATURE_SIZE);
- mutex_lock(&server->srv_mutex);
- rc = server->ops->calc_signature(rqst, server);
- mutex_unlock(&server->srv_mutex);
+ rc = server->ops->calc_signature(rqst, server, true);
if (rc)
return rc;
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 8da43a500686..1a5834a5d597 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -284,13 +284,10 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
request->sge[i].length,
DMA_TO_DEVICE);
- if (request->has_payload) {
- if (atomic_dec_and_test(&request->info->send_payload_pending))
- wake_up(&request->info->wait_send_payload_pending);
- } else {
- if (atomic_dec_and_test(&request->info->send_pending))
- wake_up(&request->info->wait_send_pending);
- }
+ if (atomic_dec_and_test(&request->info->send_pending))
+ wake_up(&request->info->wait_send_pending);
+
+ wake_up(&request->info->wait_post_send);
mempool_free(request, request->info->request_mempool);
}
@@ -383,27 +380,6 @@ static bool process_negotiation_response(
return true;
}
-/*
- * Check and schedule to send an immediate packet
- * This is used to extend credtis to remote peer to keep the transport busy
- */
-static void check_and_send_immediate(struct smbd_connection *info)
-{
- if (info->transport_status != SMBD_CONNECTED)
- return;
-
- info->send_immediate = true;
-
- /*
- * Promptly send a packet if our peer is running low on receive
- * credits
- */
- if (atomic_read(&info->receive_credits) <
- info->receive_credit_target - 1)
- queue_delayed_work(
- info->workqueue, &info->send_immediate_work, 0);
-}
-
static void smbd_post_send_credits(struct work_struct *work)
{
int ret = 0;
@@ -453,10 +429,16 @@ static void smbd_post_send_credits(struct work_struct *work)
info->new_credits_offered += ret;
spin_unlock(&info->lock_new_credits_offered);
- atomic_add(ret, &info->receive_credits);
-
- /* Check if we can post new receive and grant credits to peer */
- check_and_send_immediate(info);
+ /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
+ info->send_immediate = true;
+ if (atomic_read(&info->receive_credits) <
+ info->receive_credit_target - 1) {
+ if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
+ info->send_immediate) {
+ log_keep_alive(INFO, "send an empty message\n");
+ smbd_post_send_empty(info);
+ }
+ }
}
/* Called from softirq, when recv is done */
@@ -551,12 +533,6 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
info->keep_alive_requested = KEEP_ALIVE_PENDING;
}
- /*
- * Check if we need to send something to remote peer to
- * grant more credits or respond to KEEP_ALIVE packet
- */
- check_and_send_immediate(info);
-
return;
default:
@@ -749,7 +725,6 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info)
request->sge[0].addr,
request->sge[0].length, request->sge[0].lkey);
- request->has_payload = false;
atomic_inc(&info->send_pending);
rc = ib_post_send(info->id->qp, &send_wr, NULL);
if (!rc)
@@ -806,45 +781,96 @@ static int manage_keep_alive_before_sending(struct smbd_connection *info)
return 0;
}
-/*
- * Build and prepare the SMBD packet header
- * This function waits for avaialbe send credits and build a SMBD packet
- * header. The caller then optional append payload to the packet after
- * the header
- * intput values
- * size: the size of the payload
- * remaining_data_length: remaining data to send if this is part of a
- * fragmented packet
- * output values
- * request_out: the request allocated from this function
- * return values: 0 on success, otherwise actual error code returned
- */
-static int smbd_create_header(struct smbd_connection *info,
- int size, int remaining_data_length,
- struct smbd_request **request_out)
+/* Post the send request */
+static int smbd_post_send(struct smbd_connection *info,
+ struct smbd_request *request)
+{
+ struct ib_send_wr send_wr;
+ int rc, i;
+
+ for (i = 0; i < request->num_sge; i++) {
+ log_rdma_send(INFO,
+ "rdma_request sge[%d] addr=%llu length=%u\n",
+ i, request->sge[i].addr, request->sge[i].length);
+ ib_dma_sync_single_for_device(
+ info->id->device,
+ request->sge[i].addr,
+ request->sge[i].length,
+ DMA_TO_DEVICE);
+ }
+
+ request->cqe.done = send_done;
+
+ send_wr.next = NULL;
+ send_wr.wr_cqe = &request->cqe;
+ send_wr.sg_list = request->sge;
+ send_wr.num_sge = request->num_sge;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = IB_SEND_SIGNALED;
+
+ rc = ib_post_send(info->id->qp, &send_wr, NULL);
+ if (rc) {
+ log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
+ smbd_disconnect_rdma_connection(info);
+ rc = -EAGAIN;
+ } else
+ /* Reset timer for idle connection after packet is sent */
+ mod_delayed_work(info->workqueue, &info->idle_timer_work,
+ info->keep_alive_interval*HZ);
+
+ return rc;
+}
+
+static int smbd_post_send_sgl(struct smbd_connection *info,
+ struct scatterlist *sgl, int data_length, int remaining_data_length)
{
+ int num_sgs;
+ int i, rc;
+ int header_length;
struct smbd_request *request;
struct smbd_data_transfer *packet;
- int header_length;
- int rc;
+ int new_credits;
+ struct scatterlist *sg;
+wait_credit:
/* Wait for send credits. A SMBD packet needs one credit */
rc = wait_event_interruptible(info->wait_send_queue,
atomic_read(&info->send_credits) > 0 ||
info->transport_status != SMBD_CONNECTED);
if (rc)
- return rc;
+ goto err_wait_credit;
+
+ if (info->transport_status != SMBD_CONNECTED) {
+ log_outgoing(ERR, "disconnected not sending on wait_credit\n");
+ rc = -EAGAIN;
+ goto err_wait_credit;
+ }
+ if (unlikely(atomic_dec_return(&info->send_credits) < 0)) {
+ atomic_inc(&info->send_credits);
+ goto wait_credit;
+ }
+
+wait_send_queue:
+ wait_event(info->wait_post_send,
+ atomic_read(&info->send_pending) < info->send_credit_target ||
+ info->transport_status != SMBD_CONNECTED);
if (info->transport_status != SMBD_CONNECTED) {
- log_outgoing(ERR, "disconnected not sending\n");
- return -EAGAIN;
+ log_outgoing(ERR, "disconnected not sending on wait_send_queue\n");
+ rc = -EAGAIN;
+ goto err_wait_send_queue;
+ }
+
+ if (unlikely(atomic_inc_return(&info->send_pending) >
+ info->send_credit_target)) {
+ atomic_dec(&info->send_pending);
+ goto wait_send_queue;
}
- atomic_dec(&info->send_credits);
request = mempool_alloc(info->request_mempool, GFP_KERNEL);
if (!request) {
rc = -ENOMEM;
- goto err;
+ goto err_alloc;
}
request->info = info;
@@ -852,8 +878,11 @@ static int smbd_create_header(struct smbd_connection *info,
/* Fill in the packet header */
packet = smbd_request_payload(request);
packet->credits_requested = cpu_to_le16(info->send_credit_target);
- packet->credits_granted =
- cpu_to_le16(manage_credits_prior_sending(info));
+
+ new_credits = manage_credits_prior_sending(info);
+ atomic_add(new_credits, &info->receive_credits);
+ packet->credits_granted = cpu_to_le16(new_credits);
+
info->send_immediate = false;
packet->flags = 0;
@@ -861,11 +890,11 @@ static int smbd_create_header(struct smbd_connection *info,
packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);
packet->reserved = 0;
- if (!size)
+ if (!data_length)
packet->data_offset = 0;
else
packet->data_offset = cpu_to_le32(24);
- packet->data_length = cpu_to_le32(size);
+ packet->data_length = cpu_to_le32(data_length);
packet->remaining_data_length = cpu_to_le32(remaining_data_length);
packet->padding = 0;
@@ -880,7 +909,7 @@ static int smbd_create_header(struct smbd_connection *info,
/* Map the packet to DMA */
header_length = sizeof(struct smbd_data_transfer);
/* If this is a packet without payload, don't send padding */
- if (!size)
+ if (!data_length)
header_length = offsetof(struct smbd_data_transfer, padding);
request->num_sge = 1;
@@ -889,102 +918,15 @@ static int smbd_create_header(struct smbd_connection *info,
header_length,
DMA_TO_DEVICE);
if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
- mempool_free(request, info->request_mempool);
rc = -EIO;
- goto err;
+ request->sge[0].addr = 0;
+ goto err_dma;
}
request->sge[0].length = header_length;
request->sge[0].lkey = info->pd->local_dma_lkey;
- *request_out = request;
- return 0;
-
-err:
- atomic_inc(&info->send_credits);
- return rc;
-}
-
-static void smbd_destroy_header(struct smbd_connection *info,
- struct smbd_request *request)
-{
-
- ib_dma_unmap_single(info->id->device,
- request->sge[0].addr,
- request->sge[0].length,
- DMA_TO_DEVICE);
- mempool_free(request, info->request_mempool);
- atomic_inc(&info->send_credits);
-}
-
-/* Post the send request */
-static int smbd_post_send(struct smbd_connection *info,
- struct smbd_request *request, bool has_payload)
-{
- struct ib_send_wr send_wr;
- int rc, i;
-
- for (i = 0; i < request->num_sge; i++) {
- log_rdma_send(INFO,
- "rdma_request sge[%d] addr=%llu length=%u\n",
- i, request->sge[i].addr, request->sge[i].length);
- ib_dma_sync_single_for_device(
- info->id->device,
- request->sge[i].addr,
- request->sge[i].length,
- DMA_TO_DEVICE);
- }
-
- request->cqe.done = send_done;
-
- send_wr.next = NULL;
- send_wr.wr_cqe = &request->cqe;
- send_wr.sg_list = request->sge;
- send_wr.num_sge = request->num_sge;
- send_wr.opcode = IB_WR_SEND;
- send_wr.send_flags = IB_SEND_SIGNALED;
-
- if (has_payload) {
- request->has_payload = true;
- atomic_inc(&info->send_payload_pending);
- } else {
- request->has_payload = false;
- atomic_inc(&info->send_pending);
- }
-
- rc = ib_post_send(info->id->qp, &send_wr, NULL);
- if (rc) {
- log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
- if (has_payload) {
- if (atomic_dec_and_test(&info->send_payload_pending))
- wake_up(&info->wait_send_payload_pending);
- } else {
- if (atomic_dec_and_test(&info->send_pending))
- wake_up(&info->wait_send_pending);
- }
- smbd_disconnect_rdma_connection(info);
- rc = -EAGAIN;
- } else
- /* Reset timer for idle connection after packet is sent */
- mod_delayed_work(info->workqueue, &info->idle_timer_work,
- info->keep_alive_interval*HZ);
-
- return rc;
-}
-
-static int smbd_post_send_sgl(struct smbd_connection *info,
- struct scatterlist *sgl, int data_length, int remaining_data_length)
-{
- int num_sgs;
- int i, rc;
- struct smbd_request *request;
- struct scatterlist *sg;
-
- rc = smbd_create_header(
- info, data_length, remaining_data_length, &request);
- if (rc)
- return rc;
-
+ /* Fill in the packet data payload */
num_sgs = sgl ? sg_nents(sgl) : 0;
for_each_sg(sgl, sg, num_sgs, i) {
request->sge[i+1].addr =
@@ -994,25 +936,41 @@ static int smbd_post_send_sgl(struct smbd_connection *info,
info->id->device, request->sge[i+1].addr)) {
rc = -EIO;
request->sge[i+1].addr = 0;
- goto dma_mapping_failure;
+ goto err_dma;
}
request->sge[i+1].length = sg->length;
request->sge[i+1].lkey = info->pd->local_dma_lkey;
request->num_sge++;
}
- rc = smbd_post_send(info, request, data_length);
+ rc = smbd_post_send(info, request);
if (!rc)
return 0;
-dma_mapping_failure:
- for (i = 1; i < request->num_sge; i++)
+err_dma:
+ for (i = 0; i < request->num_sge; i++)
if (request->sge[i].addr)
ib_dma_unmap_single(info->id->device,
request->sge[i].addr,
request->sge[i].length,
DMA_TO_DEVICE);
- smbd_destroy_header(info, request);
+ mempool_free(request, info->request_mempool);
+
+ /* roll back receive credits and credits to be offered */
+ spin_lock(&info->lock_new_credits_offered);
+ info->new_credits_offered += new_credits;
+ spin_unlock(&info->lock_new_credits_offered);
+ atomic_sub(new_credits, &info->receive_credits);
+
+err_alloc:
+ if (atomic_dec_and_test(&info->send_pending))
+ wake_up(&info->wait_send_pending);
+
+err_wait_send_queue:
+ /* roll back send credits and pending */
+ atomic_inc(&info->send_credits);
+
+err_wait_credit:
return rc;
}
@@ -1334,25 +1292,6 @@ static void destroy_receive_buffers(struct smbd_connection *info)
mempool_free(response, info->response_mempool);
}
-/*
- * Check and send an immediate or keep alive packet
- * The condition to send those packets are defined in [MS-SMBD] 3.1.1.1
- * Connection.KeepaliveRequested and Connection.SendImmediate
- * The idea is to extend credits to server as soon as it becomes available
- */
-static void send_immediate_work(struct work_struct *work)
-{
- struct smbd_connection *info = container_of(
- work, struct smbd_connection,
- send_immediate_work.work);
-
- if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
- info->send_immediate) {
- log_keep_alive(INFO, "send an empty message\n");
- smbd_post_send_empty(info);
- }
-}
-
/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
static void idle_connection_timer(struct work_struct *work)
{
@@ -1407,14 +1346,10 @@ void smbd_destroy(struct TCP_Server_Info *server)
log_rdma_event(INFO, "cancelling idle timer\n");
cancel_delayed_work_sync(&info->idle_timer_work);
- log_rdma_event(INFO, "cancelling send immediate work\n");
- cancel_delayed_work_sync(&info->send_immediate_work);
log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
wait_event(info->wait_send_pending,
atomic_read(&info->send_pending) == 0);
- wait_event(info->wait_send_payload_pending,
- atomic_read(&info->send_payload_pending) == 0);
/* It's not posssible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
@@ -1744,15 +1679,13 @@ static struct smbd_connection *_smbd_get_connection(
init_waitqueue_head(&info->wait_send_queue);
INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
- INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work);
queue_delayed_work(info->workqueue, &info->idle_timer_work,
info->keep_alive_interval*HZ);
init_waitqueue_head(&info->wait_send_pending);
atomic_set(&info->send_pending, 0);
- init_waitqueue_head(&info->wait_send_payload_pending);
- atomic_set(&info->send_payload_pending, 0);
+ init_waitqueue_head(&info->wait_post_send);
INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
@@ -2226,8 +2159,8 @@ done:
* that means all the I/Os have been out and we are good to return
*/
- wait_event(info->wait_send_payload_pending,
- atomic_read(&info->send_payload_pending) == 0);
+ wait_event(info->wait_send_pending,
+ atomic_read(&info->send_pending) == 0);
return rc;
}
diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h
index 8ede915f2b24..a87fca82a796 100644
--- a/fs/cifs/smbdirect.h
+++ b/fs/cifs/smbdirect.h
@@ -114,8 +114,7 @@ struct smbd_connection {
/* Activity accoutning */
atomic_t send_pending;
wait_queue_head_t wait_send_pending;
- atomic_t send_payload_pending;
- wait_queue_head_t wait_send_payload_pending;
+ wait_queue_head_t wait_post_send;
/* Receive queue */
struct list_head receive_queue;
@@ -154,7 +153,6 @@ struct smbd_connection {
struct workqueue_struct *workqueue;
struct delayed_work idle_timer_work;
- struct delayed_work send_immediate_work;
/* Memory pool for preallocating buffers */
/* request pool for RDMA send */
@@ -234,9 +232,6 @@ struct smbd_request {
struct smbd_connection *info;
struct ib_cqe cqe;
- /* true if this request carries upper layer payload */
- bool has_payload;
-
/* the SGE entries for this packet */
struct ib_sge sge[SMBDIRECT_MAX_SGE];
int num_sge;
diff --git a/fs/dax.c b/fs/dax.c
index 35da144375a0..11b16729b86f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1038,50 +1038,43 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
return ret;
}
-static bool dax_range_is_aligned(struct block_device *bdev,
- unsigned int offset, unsigned int length)
+int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
+ struct iomap *iomap)
{
- unsigned short sector_size = bdev_logical_block_size(bdev);
+ sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
+ pgoff_t pgoff;
+ long rc, id;
+ void *kaddr;
+ bool page_aligned = false;
- if (!IS_ALIGNED(offset, sector_size))
- return false;
- if (!IS_ALIGNED(length, sector_size))
- return false;
- return true;
-}
+ if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
+ IS_ALIGNED(size, PAGE_SIZE))
+ page_aligned = true;
-int __dax_zero_page_range(struct block_device *bdev,
- struct dax_device *dax_dev, sector_t sector,
- unsigned int offset, unsigned int size)
-{
- if (dax_range_is_aligned(bdev, offset, size)) {
- sector_t start_sector = sector + (offset >> 9);
+ rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
+ if (rc)
+ return rc;
- return blkdev_issue_zeroout(bdev, start_sector,
- size >> 9, GFP_NOFS, 0);
- } else {
- pgoff_t pgoff;
- long rc, id;
- void *kaddr;
+ id = dax_read_lock();
- rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
- if (rc)
- return rc;
+ if (page_aligned)
+ rc = dax_zero_page_range(iomap->dax_dev, pgoff,
+ size >> PAGE_SHIFT);
+ else
+ rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
+ if (rc < 0) {
+ dax_read_unlock(id);
+ return rc;
+ }
- id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
- if (rc < 0) {
- dax_read_unlock(id);
- return rc;
- }
+ if (!page_aligned) {
memset(kaddr + offset, 0, size);
- dax_flush(dax_dev, kaddr + offset, size);
- dax_read_unlock(id);
+ dax_flush(iomap->dax_dev, kaddr + offset, size);
}
+ dax_read_unlock(id);
return 0;
}
-EXPORT_SYMBOL_GPL(__dax_zero_page_range);
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 77bf5f95362d..90b8d879fbaf 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -272,7 +272,9 @@ struct file_system_type *get_fs_type(const char *name)
fs = __get_fs_type(name, len);
if (!fs && (request_module("fs-%.*s", len, name) == 0)) {
fs = __get_fs_type(name, len);
- WARN_ONCE(!fs, "request_module fs-%.*s succeeded, but still no fs?\n", len, name);
+ if (!fs)
+ pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n",
+ len, name);
}
if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index e6d554476db4..eeebe80c6be4 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -292,6 +292,10 @@ static int __hfsplus_delete_attr(struct inode *inode, u32 cnid,
return -ENOENT;
}
+ /* Avoid btree corruption */
+ hfs_bnode_read(fd->bnode, fd->search_key,
+ fd->keyoffset, fd->keylength);
+
err = hfs_brec_remove(fd);
if (err)
return err;
diff --git a/fs/io-wq.c b/fs/io-wq.c
index cc5cf2209fb0..4023c9846860 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -17,6 +17,7 @@
#include <linux/kthread.h>
#include <linux/rculist_nulls.h>
#include <linux/fs_struct.h>
+#include <linux/task_work.h>
#include "io-wq.h"
@@ -716,6 +717,9 @@ static int io_wq_manager(void *data)
complete(&wq->done);
while (!kthread_should_stop()) {
+ if (current->task_works)
+ task_work_run();
+
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
bool fork_worker[2] = { false, false };
@@ -738,6 +742,9 @@ static int io_wq_manager(void *data)
schedule_timeout(HZ);
}
+ if (current->task_works)
+ task_work_run();
+
return 0;
err:
set_bit(IO_WQ_BIT_ERROR, &wq->state);
@@ -1124,3 +1131,8 @@ void io_wq_destroy(struct io_wq *wq)
if (refcount_dec_and_test(&wq->use_refs))
__io_wq_destroy(wq);
}
+
+struct task_struct *io_wq_get_task(struct io_wq *wq)
+{
+ return wq->manager;
+}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 3ee7356d6be5..5ba12de7572f 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -136,6 +136,8 @@ typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data);
+struct task_struct *io_wq_get_task(struct io_wq *wq);
+
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 358f97be9c7b..5190bfb6a665 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -186,14 +186,23 @@ struct fixed_file_table {
struct file **files;
};
+struct fixed_file_ref_node {
+ struct percpu_ref refs;
+ struct list_head node;
+ struct list_head file_list;
+ struct fixed_file_data *file_data;
+ struct work_struct work;
+};
+
struct fixed_file_data {
struct fixed_file_table *table;
struct io_ring_ctx *ctx;
+ struct percpu_ref *cur_refs;
struct percpu_ref refs;
- struct llist_head put_llist;
- struct work_struct ref_work;
struct completion done;
+ struct list_head ref_list;
+ spinlock_t lock;
};
struct io_buffer {
@@ -317,6 +326,8 @@ struct io_ring_ctx {
spinlock_t inflight_lock;
struct list_head inflight_list;
} ____cacheline_aligned_in_smp;
+
+ struct work_struct exit_work;
};
/*
@@ -599,6 +610,7 @@ struct io_kiocb {
};
struct io_async_ctx *io;
+ int cflags;
bool needs_fixed_file;
u8 opcode;
@@ -606,10 +618,8 @@ struct io_kiocb {
struct list_head list;
unsigned int flags;
refcount_t refs;
- union {
- struct task_struct *task;
- unsigned long fsize;
- };
+ struct task_struct *task;
+ unsigned long fsize;
u64 user_data;
u32 result;
u32 sequence;
@@ -618,6 +628,8 @@ struct io_kiocb {
struct list_head inflight_entry;
+ struct percpu_ref *fixed_file_refs;
+
union {
/*
* Only commands that never go async can use the below fields,
@@ -629,7 +641,6 @@ struct io_kiocb {
struct callback_head task_work;
struct hlist_node hash_node;
struct async_poll *apoll;
- int cflags;
};
struct io_wq_work work;
};
@@ -848,7 +859,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
-static void io_ring_file_ref_flush(struct fixed_file_data *data);
static void io_cleanup_req(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
int fd, struct file **out_file, bool fixed);
@@ -1285,8 +1295,8 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
return NULL;
}
-static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
- struct io_submit_state *state)
+static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
+ struct io_submit_state *state)
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
@@ -1319,41 +1329,20 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
req = state->reqs[state->free_reqs];
}
-got_it:
- req->io = NULL;
- req->file = NULL;
- req->ctx = ctx;
- req->flags = 0;
- /* one is dropped after submission, the other at completion */
- refcount_set(&req->refs, 2);
- req->result = 0;
- INIT_IO_WORK(&req->work, io_wq_submit_work);
return req;
fallback:
- req = io_get_fallback_req(ctx);
- if (req)
- goto got_it;
- percpu_ref_put(&ctx->refs);
- return NULL;
+ return io_get_fallback_req(ctx);
}
static inline void io_put_file(struct io_kiocb *req, struct file *file,
bool fixed)
{
if (fixed)
- percpu_ref_put(&req->ctx->file_data->refs);
+ percpu_ref_put(req->fixed_file_refs);
else
fput(file);
}
-static void __io_req_do_free(struct io_kiocb *req)
-{
- if (likely(!io_is_fallback_req(req)))
- kmem_cache_free(req_cachep, req);
- else
- clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
-}
-
static void __io_req_aux_free(struct io_kiocb *req)
{
if (req->flags & REQ_F_NEED_CLEANUP)
@@ -1362,6 +1351,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
+ if (req->task)
+ put_task_struct(req->task);
io_req_work_drop_env(req);
}
@@ -1382,7 +1373,10 @@ static void __io_free_req(struct io_kiocb *req)
}
percpu_ref_put(&req->ctx->refs);
- __io_req_do_free(req);
+ if (likely(!io_is_fallback_req(req)))
+ kmem_cache_free(req_cachep, req);
+ else
+ clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
}
struct req_batch {
@@ -1393,21 +1387,18 @@ struct req_batch {
static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
{
- int fixed_refs = rb->to_free;
-
if (!rb->to_free)
return;
if (rb->need_iter) {
int i, inflight = 0;
unsigned long flags;
- fixed_refs = 0;
for (i = 0; i < rb->to_free; i++) {
struct io_kiocb *req = rb->reqs[i];
if (req->flags & REQ_F_FIXED_FILE) {
req->file = NULL;
- fixed_refs++;
+ percpu_ref_put(req->fixed_file_refs);
}
if (req->flags & REQ_F_INFLIGHT)
inflight++;
@@ -1433,8 +1424,6 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
}
do_free:
kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
- if (fixed_refs)
- percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
percpu_ref_put_many(&ctx->refs, rb->to_free);
rb->to_free = rb->need_iter = 0;
}
@@ -1738,11 +1727,24 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
io_free_req_many(ctx, &rb);
}
+static void io_iopoll_queue(struct list_head *again)
+{
+ struct io_kiocb *req;
+
+ do {
+ req = list_first_entry(again, struct io_kiocb, list);
+ list_del(&req->list);
+ refcount_inc(&req->refs);
+ io_queue_async_work(req);
+ } while (!list_empty(again));
+}
+
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
long min)
{
struct io_kiocb *req, *tmp;
LIST_HEAD(done);
+ LIST_HEAD(again);
bool spin;
int ret;
@@ -1757,9 +1759,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
struct kiocb *kiocb = &req->rw.kiocb;
/*
- * Move completed entries to our local list. If we find a
- * request that requires polling, break out and complete
- * the done list first, if we have entries there.
+ * Move completed and retryable entries to our local lists.
+ * If we find a request that requires polling, break out
+ * and complete those lists first, if we have entries there.
*/
if (req->flags & REQ_F_IOPOLL_COMPLETED) {
list_move_tail(&req->list, &done);
@@ -1768,6 +1770,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
break;
+ if (req->result == -EAGAIN) {
+ list_move_tail(&req->list, &again);
+ continue;
+ }
+ if (!list_empty(&again))
+ break;
+
ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
if (ret < 0)
break;
@@ -1780,6 +1789,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
io_iopoll_complete(ctx, nr_events, &done);
+ if (!list_empty(&again))
+ io_iopoll_queue(&again);
+
return ret;
}
@@ -2465,8 +2477,9 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
req->io->rw.iov = iovec;
if (!req->io->rw.iov) {
req->io->rw.iov = req->io->rw.fast_iov;
- memcpy(req->io->rw.iov, fast_iov,
- sizeof(struct iovec) * iter->nr_segs);
+ if (req->io->rw.iov != fast_iov)
+ memcpy(req->io->rw.iov, fast_iov,
+ sizeof(struct iovec) * iter->nr_segs);
} else {
req->flags |= REQ_F_NEED_CLEANUP;
}
@@ -2920,7 +2933,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
@@ -2929,6 +2942,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->open.how.mode = READ_ONCE(sqe->len);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->open.how.flags = READ_ONCE(sqe->open_flags);
+ if (force_o_largefile())
+ req->open.how.flags |= O_LARGEFILE;
req->open.filename = getname(fname);
if (IS_ERR(req->open.filename)) {
@@ -2951,7 +2966,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
@@ -3305,7 +3320,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
@@ -3382,7 +3397,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
sqe->rw_flags || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
@@ -3481,14 +3496,11 @@ static void __io_sync_file_range(struct io_kiocb *req)
static void io_sync_file_range_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *nxt = NULL;
if (io_req_cancelled(req))
return;
__io_sync_file_range(req);
io_put_req(req); /* put submission ref */
- if (nxt)
- io_wq_assign_next(workptr, nxt);
}
static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
@@ -4114,6 +4126,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
struct task_struct *tsk;
+ int ret;
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
@@ -4127,11 +4140,15 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
req->result = mask;
init_task_work(&req->task_work, func);
/*
- * If this fails, then the task is exiting. If that is the case, then
- * the exit check will ultimately cancel these work items. Hence we
- * don't need to check here and handle it specifically.
+ * If this fails, then the task is exiting. Punt to one of the io-wq
+ * threads to ensure the work gets run, we can't always rely on exit
+ * cancelation taking care of this.
*/
- task_work_add(tsk, &req->task_work, true);
+ ret = task_work_add(tsk, &req->task_work, true);
+ if (unlikely(ret)) {
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, true);
+ }
wake_up_process(tsk);
return 1;
}
@@ -4251,10 +4268,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
req->flags |= REQ_F_POLLED;
memcpy(&apoll->work, &req->work, sizeof(req->work));
- /*
- * Don't need a reference here, as we're adding it to the task
- * task_works list. If the task exits, the list is pruned.
- */
+ get_task_struct(current);
req->task = current;
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
@@ -4407,8 +4421,20 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_poll_iocb *poll = &req->poll;
+
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ struct poll_table_struct pt = { ._key = poll->events };
+
+ req->result = vfs_poll(req->file, &pt) & poll->events;
+ }
spin_lock_irq(&ctx->completion_lock);
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ add_wait_queue(poll->head, &poll->wait);
+ spin_unlock_irq(&ctx->completion_lock);
+ return;
+ }
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
req->flags |= REQ_F_COMP_LOCKED;
@@ -4465,10 +4491,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
- /*
- * Don't need a reference here, as we're adding it to the task
- * task_works list. If the task exits, the list is pruned.
- */
+ get_task_struct(current);
req->task = current;
return 0;
}
@@ -5331,7 +5354,8 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
file = io_file_from_index(ctx, fd);
if (!file)
return -EBADF;
- percpu_ref_get(&ctx->file_data->refs);
+ req->fixed_file_refs = ctx->file_data->cur_refs;
+ percpu_ref_get(req->fixed_file_refs);
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd);
@@ -5344,15 +5368,10 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
}
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+ int fd, unsigned int flags)
{
- unsigned flags;
- int fd;
bool fixed;
- flags = READ_ONCE(sqe->flags);
- fd = READ_ONCE(sqe->fd);
-
if (!io_req_needs_file(req, fd))
return 0;
@@ -5594,7 +5613,7 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
{
struct io_ring_ctx *ctx = req->ctx;
unsigned int sqe_flags;
- int ret, id;
+ int ret, id, fd;
sqe_flags = READ_ONCE(sqe->flags);
@@ -5625,7 +5644,8 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
IOSQE_ASYNC | IOSQE_FIXED_FILE |
IOSQE_BUFFER_SELECT);
- ret = io_req_set_file(state, req, sqe);
+ fd = READ_ONCE(sqe->fd);
+ ret = io_req_set_file(state, req, fd, sqe_flags);
if (unlikely(ret)) {
err_req:
io_cqring_add_event(req, ret);
@@ -5741,8 +5761,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
* used, it's important that those reads are done through READ_ONCE() to
* prevent a re-load down the line.
*/
-static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe **sqe_ptr)
+static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
u32 *sq_array = ctx->sq_array;
unsigned head;
@@ -5756,25 +5775,40 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
* though the application is the one updating it.
*/
head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
- if (likely(head < ctx->sq_entries)) {
- /*
- * All io need record the previous position, if LINK vs DARIN,
- * it can be used to mark the position of the first IO in the
- * link list.
- */
- req->sequence = ctx->cached_sq_head;
- *sqe_ptr = &ctx->sq_sqes[head];
- req->opcode = READ_ONCE((*sqe_ptr)->opcode);
- req->user_data = READ_ONCE((*sqe_ptr)->user_data);
- ctx->cached_sq_head++;
- return true;
- }
+ if (likely(head < ctx->sq_entries))
+ return &ctx->sq_sqes[head];
/* drop invalid entries */
- ctx->cached_sq_head++;
ctx->cached_sq_dropped++;
WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
- return false;
+ return NULL;
+}
+
+static inline void io_consume_sqe(struct io_ring_ctx *ctx)
+{
+ ctx->cached_sq_head++;
+}
+
+static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ /*
+ * All io need record the previous position, if LINK vs DARIN,
+ * it can be used to mark the position of the first IO in the
+ * link list.
+ */
+ req->sequence = ctx->cached_sq_head;
+ req->opcode = READ_ONCE(sqe->opcode);
+ req->user_data = READ_ONCE(sqe->user_data);
+ req->io = NULL;
+ req->file = NULL;
+ req->ctx = ctx;
+ req->flags = 0;
+ /* one is dropped after submission, the other at completion */
+ refcount_set(&req->refs, 2);
+ req->task = NULL;
+ req->result = 0;
+ INIT_IO_WORK(&req->work, io_wq_submit_work);
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
@@ -5812,17 +5846,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
struct io_kiocb *req;
int err;
- req = io_get_req(ctx, statep);
+ sqe = io_get_sqe(ctx);
+ if (unlikely(!sqe)) {
+ io_consume_sqe(ctx);
+ break;
+ }
+ req = io_alloc_req(ctx, statep);
if (unlikely(!req)) {
if (!submitted)
submitted = -EAGAIN;
break;
}
- if (!io_get_sqring(ctx, req, &sqe)) {
- __io_req_do_free(req);
- break;
- }
+ io_init_req(ctx, req, sqe);
+ io_consume_sqe(ctx);
/* will complete beyond this point, count as submitted */
submitted++;
@@ -5962,6 +5999,7 @@ static int io_sq_thread(void *data)
}
if (current->task_works) {
task_work_run();
+ finish_wait(&ctx->sqo_wait, &wait);
continue;
}
if (signal_pending(current))
@@ -6124,43 +6162,36 @@ static void io_file_ref_kill(struct percpu_ref *ref)
complete(&data->done);
}
-static void io_file_ref_exit_and_free(struct work_struct *work)
-{
- struct fixed_file_data *data;
-
- data = container_of(work, struct fixed_file_data, ref_work);
-
- /*
- * Ensure any percpu-ref atomic switch callback has run, it could have
- * been in progress when the files were being unregistered. Once
- * that's done, we can safely exit and free the ref and containing
- * data structure.
- */
- rcu_barrier();
- percpu_ref_exit(&data->refs);
- kfree(data);
-}
-
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
struct fixed_file_data *data = ctx->file_data;
+ struct fixed_file_ref_node *ref_node = NULL;
unsigned nr_tables, i;
+ unsigned long flags;
if (!data)
return -ENXIO;
- percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
- flush_work(&data->ref_work);
+ spin_lock_irqsave(&data->lock, flags);
+ if (!list_empty(&data->ref_list))
+ ref_node = list_first_entry(&data->ref_list,
+ struct fixed_file_ref_node, node);
+ spin_unlock_irqrestore(&data->lock, flags);
+ if (ref_node)
+ percpu_ref_kill(&ref_node->refs);
+
+ percpu_ref_kill(&data->refs);
+
+ /* wait for all refs nodes to complete */
wait_for_completion(&data->done);
- io_ring_file_ref_flush(data);
__io_sqe_files_unregister(ctx);
nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
for (i = 0; i < nr_tables; i++)
kfree(data->table[i].files);
kfree(data->table);
- INIT_WORK(&data->ref_work, io_file_ref_exit_and_free);
- queue_work(system_wq, &data->ref_work);
+ percpu_ref_exit(&data->refs);
+ kfree(data);
ctx->file_data = NULL;
ctx->nr_user_files = 0;
return 0;
@@ -6204,13 +6235,6 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
struct sk_buff *skb;
int i, nr_files;
- if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
- unsigned long inflight = ctx->user->unix_inflight + nr;
-
- if (inflight > task_rlimit(current, RLIMIT_NOFILE))
- return -EMFILE;
- }
-
fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
if (!fpl)
return -ENOMEM;
@@ -6385,46 +6409,72 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
}
struct io_file_put {
- struct llist_node llist;
+ struct list_head list;
struct file *file;
};
-static void io_ring_file_ref_flush(struct fixed_file_data *data)
+static void io_file_put_work(struct work_struct *work)
{
+ struct fixed_file_ref_node *ref_node;
+ struct fixed_file_data *file_data;
+ struct io_ring_ctx *ctx;
struct io_file_put *pfile, *tmp;
- struct llist_node *node;
+ unsigned long flags;
- while ((node = llist_del_all(&data->put_llist)) != NULL) {
- llist_for_each_entry_safe(pfile, tmp, node, llist) {
- io_ring_file_put(data->ctx, pfile->file);
- kfree(pfile);
- }
+ ref_node = container_of(work, struct fixed_file_ref_node, work);
+ file_data = ref_node->file_data;
+ ctx = file_data->ctx;
+
+ list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
+ list_del_init(&pfile->list);
+ io_ring_file_put(ctx, pfile->file);
+ kfree(pfile);
}
+
+ spin_lock_irqsave(&file_data->lock, flags);
+ list_del_init(&ref_node->node);
+ spin_unlock_irqrestore(&file_data->lock, flags);
+
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
+ percpu_ref_put(&file_data->refs);
}
-static void io_ring_file_ref_switch(struct work_struct *work)
+static void io_file_data_ref_zero(struct percpu_ref *ref)
{
- struct fixed_file_data *data;
+ struct fixed_file_ref_node *ref_node;
+
+ ref_node = container_of(ref, struct fixed_file_ref_node, refs);
- data = container_of(work, struct fixed_file_data, ref_work);
- io_ring_file_ref_flush(data);
- percpu_ref_switch_to_percpu(&data->refs);
+ queue_work(system_wq, &ref_node->work);
}
-static void io_file_data_ref_zero(struct percpu_ref *ref)
+static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
+ struct io_ring_ctx *ctx)
{
- struct fixed_file_data *data;
+ struct fixed_file_ref_node *ref_node;
- data = container_of(ref, struct fixed_file_data, refs);
+ ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+ if (!ref_node)
+ return ERR_PTR(-ENOMEM);
+
+ if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
+ 0, GFP_KERNEL)) {
+ kfree(ref_node);
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&ref_node->node);
+ INIT_LIST_HEAD(&ref_node->file_list);
+ INIT_WORK(&ref_node->work, io_file_put_work);
+ ref_node->file_data = ctx->file_data;
+ return ref_node;
- /*
- * We can't safely switch from inside this context, punt to wq. If
- * the table ref is going away, the table is being unregistered.
- * Don't queue up the async work for that case, the caller will
- * handle it.
- */
- if (!percpu_ref_is_dying(&data->refs))
- queue_work(system_wq, &data->ref_work);
+}
+
+static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
+{
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
}
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -6435,6 +6485,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
struct file *file;
int fd, ret = 0;
unsigned i;
+ struct fixed_file_ref_node *ref_node;
+ unsigned long flags;
if (ctx->file_data)
return -EBUSY;
@@ -6448,6 +6500,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -ENOMEM;
ctx->file_data->ctx = ctx;
init_completion(&ctx->file_data->done);
+ INIT_LIST_HEAD(&ctx->file_data->ref_list);
+ spin_lock_init(&ctx->file_data->lock);
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
ctx->file_data->table = kcalloc(nr_tables,
@@ -6459,15 +6513,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -ENOMEM;
}
- if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
+ if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
return -ENOMEM;
}
- ctx->file_data->put_llist.first = NULL;
- INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
percpu_ref_exit(&ctx->file_data->refs);
@@ -6530,9 +6582,22 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
}
ret = io_sqe_files_scm(ctx);
- if (ret)
+ if (ret) {
io_sqe_files_unregister(ctx);
+ return ret;
+ }
+
+ ref_node = alloc_fixed_file_ref_node(ctx);
+ if (IS_ERR(ref_node)) {
+ io_sqe_files_unregister(ctx);
+ return PTR_ERR(ref_node);
+ }
+ ctx->file_data->cur_refs = &ref_node->refs;
+ spin_lock_irqsave(&ctx->file_data->lock, flags);
+ list_add(&ref_node->node, &ctx->file_data->ref_list);
+ spin_unlock_irqrestore(&ctx->file_data->lock, flags);
+ percpu_ref_get(&ctx->file_data->refs);
return ret;
}
@@ -6579,30 +6644,21 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
-static void io_atomic_switch(struct percpu_ref *ref)
-{
- struct fixed_file_data *data;
-
- /*
- * Juggle reference to ensure we hit zero, if needed, so we can
- * switch back to percpu mode
- */
- data = container_of(ref, struct fixed_file_data, refs);
- percpu_ref_put(&data->refs);
- percpu_ref_get(&data->refs);
-}
-
static int io_queue_file_removal(struct fixed_file_data *data,
- struct file *file)
+ struct file *file)
{
struct io_file_put *pfile;
+ struct percpu_ref *refs = data->cur_refs;
+ struct fixed_file_ref_node *ref_node;
pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
if (!pfile)
return -ENOMEM;
+ ref_node = container_of(refs, struct fixed_file_ref_node, refs);
pfile->file = file;
- llist_add(&pfile->llist, &data->put_llist);
+ list_add(&pfile->list, &ref_node->file_list);
+
return 0;
}
@@ -6611,17 +6667,23 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
unsigned nr_args)
{
struct fixed_file_data *data = ctx->file_data;
- bool ref_switch = false;
+ struct fixed_file_ref_node *ref_node;
struct file *file;
__s32 __user *fds;
int fd, i, err;
__u32 done;
+ unsigned long flags;
+ bool needs_switch = false;
if (check_add_overflow(up->offset, nr_args, &done))
return -EOVERFLOW;
if (done > ctx->nr_user_files)
return -EINVAL;
+ ref_node = alloc_fixed_file_ref_node(ctx);
+ if (IS_ERR(ref_node))
+ return PTR_ERR(ref_node);
+
done = 0;
fds = u64_to_user_ptr(up->fds);
while (nr_args) {
@@ -6642,7 +6704,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (err)
break;
table->files[index] = NULL;
- ref_switch = true;
+ needs_switch = true;
}
if (fd != -1) {
file = fget(fd);
@@ -6673,11 +6735,19 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
up->offset++;
}
- if (ref_switch)
- percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
+ if (needs_switch) {
+ percpu_ref_kill(data->cur_refs);
+ spin_lock_irqsave(&data->lock, flags);
+ list_add(&ref_node->node, &data->ref_list);
+ data->cur_refs = &ref_node->refs;
+ spin_unlock_irqrestore(&data->lock, flags);
+ percpu_ref_get(&ctx->file_data->refs);
+ } else
+ destroy_fixed_file_ref_node(ref_node);
return done ? done : err;
}
+
static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
@@ -7203,6 +7273,18 @@ static int io_remove_personalities(int id, void *p, void *data)
return 0;
}
+static void io_ring_exit_work(struct work_struct *work)
+{
+ struct io_ring_ctx *ctx;
+
+ ctx = container_of(work, struct io_ring_ctx, exit_work);
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true);
+
+ wait_for_completion(&ctx->completions[0]);
+ io_ring_ctx_free(ctx);
+}
+
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
@@ -7230,8 +7312,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
- wait_for_completion(&ctx->completions[0]);
- io_ring_ctx_free(ctx);
+ INIT_WORK(&ctx->exit_work, io_ring_exit_work);
+ queue_work(system_wq, &ctx->exit_work);
}
static int io_uring_release(struct inode *inode, struct file *file)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index f080f542911b..89e21961d1ad 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -302,6 +302,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) {
gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+ gfp_t orig_gfp = gfp;
int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (ctx->bio)
@@ -310,6 +311,13 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (ctx->is_readahead) /* same as readahead_gfp_mask */
gfp |= __GFP_NORETRY | __GFP_NOWARN;
ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
+ /*
+ * If the bio_alloc fails, try it again for a single page to
+ * avoid having to deal with partial page reads. This emulates
+ * what do_mpage_readpage does.
+ */
+ if (!ctx->bio)
+ ctx->bio = bio_alloc(orig_gfp, 1);
ctx->bio->bi_opf = REQ_OP_READ;
if (ctx->is_readahead)
ctx->bio->bi_opf |= REQ_RAHEAD;
@@ -975,13 +983,6 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
}
-static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
- struct iomap *iomap)
-{
- return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
- iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
-}
-
static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
void *data, struct iomap *iomap, struct iomap *srcmap)
@@ -1001,7 +1002,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
bytes = min_t(loff_t, PAGE_SIZE - offset, count);
if (IS_DAX(inode))
- status = iomap_dax_zero(pos, offset, bytes, iomap);
+ status = dax_iomap_zero(pos, offset, bytes, iomap);
else
status = iomap_zero(inode, pos, offset, bytes, iomap,
srcmap);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 25f135572fc8..e7ddbce48321 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -501,6 +501,7 @@ pnfs_alloc_ds_commits_list(struct list_head *list,
rcu_read_lock();
pnfs_put_commit_array(array, cinfo->inode);
}
+ rcu_read_unlock();
return ret;
}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 65b3abbcce4e..2f834add165b 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7402,6 +7402,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_inline_data *idata = &di->id2.i_data;
+ /* No need to punch hole beyond i_size. */
+ if (start >= i_size_read(inode))
+ return 0;
+
if (end > i_size_read(inode))
end = i_size_read(inode);
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index c740159d9ad1..af375e049aae 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -346,23 +346,8 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
struct iov_iter *iter)
{
int ret;
- struct orangefs_read_options *ro;
-
orangefs_stats.reads++;
- /*
- * Remember how they set "count" in read(2) or pread(2) or whatever -
- * users can use count as a knob to control orangefs io size and later
- * we can try to help them fill as many pages as possible in readpage.
- */
- if (!iocb->ki_filp->private_data) {
- iocb->ki_filp->private_data = kmalloc(sizeof *ro, GFP_KERNEL);
- if (!iocb->ki_filp->private_data)
- return(ENOMEM);
- ro = iocb->ki_filp->private_data;
- ro->blksiz = iter->count;
- }
-
down_read(&file_inode(iocb->ki_filp)->i_rwsem);
ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
if (ret)
@@ -650,12 +635,6 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
return rc;
}
-static int orangefs_file_open(struct inode * inode, struct file *file)
-{
- file->private_data = NULL;
- return generic_file_open(inode, file);
-}
-
static int orangefs_flush(struct file *file, fl_owner_t id)
{
/*
@@ -666,19 +645,8 @@ static int orangefs_flush(struct file *file, fl_owner_t id)
* on an explicit fsync call. This duplicates historical OrangeFS
* behavior.
*/
- struct inode *inode = file->f_mapping->host;
int r;
- kfree(file->private_data);
- file->private_data = NULL;
-
- if (inode->i_state & I_DIRTY_TIME) {
- spin_lock(&inode->i_lock);
- inode->i_state &= ~I_DIRTY_TIME;
- spin_unlock(&inode->i_lock);
- mark_inode_dirty_sync(inode);
- }
-
r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
if (r > 0)
return 0;
@@ -694,7 +662,7 @@ const struct file_operations orangefs_file_operations = {
.lock = orangefs_lock,
.unlocked_ioctl = orangefs_ioctl,
.mmap = orangefs_file_mmap,
- .open = orangefs_file_open,
+ .open = generic_file_open,
.flush = orangefs_flush,
.release = orangefs_file_release,
.fsync = orangefs_fsync,
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 961c0fd8675a..12ae630fbed7 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -259,46 +259,19 @@ static int orangefs_readpage(struct file *file, struct page *page)
pgoff_t index; /* which page */
struct page *next_page;
char *kaddr;
- struct orangefs_read_options *ro = file->private_data;
loff_t read_size;
- loff_t roundedup;
int buffer_index = -1; /* orangefs shared memory slot */
int slot_index; /* index into slot */
int remaining;
/*
- * If they set some miniscule size for "count" in read(2)
- * (for example) then let's try to read a page, or the whole file
- * if it is smaller than a page. Once "count" goes over a page
- * then lets round up to the highest page size multiple that is
- * less than or equal to "count" and do that much orangefs IO and
- * try to fill as many pages as we can from it.
- *
- * "count" should be represented in ro->blksiz.
- *
- * inode->i_size = file size.
+ * Get up to this many bytes from Orangefs at a time and try
+ * to fill them into the page cache at once. Tests with dd made
+ * this seem like a reasonable static number, if there was
+ * interest perhaps this number could be made setable through
+ * sysfs...
*/
- if (ro) {
- if (ro->blksiz < PAGE_SIZE) {
- if (inode->i_size < PAGE_SIZE)
- read_size = inode->i_size;
- else
- read_size = PAGE_SIZE;
- } else {
- roundedup = ((PAGE_SIZE - 1) & ro->blksiz) ?
- ((ro->blksiz + PAGE_SIZE) & ~(PAGE_SIZE -1)) :
- ro->blksiz;
- if (roundedup > inode->i_size)
- read_size = inode->i_size;
- else
- read_size = roundedup;
-
- }
- } else {
- read_size = PAGE_SIZE;
- }
- if (!read_size)
- read_size = PAGE_SIZE;
+ read_size = 524288;
if (PageDirty(page))
orangefs_launder_page(page);
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index ed67f39fa7ce..e12aeb9623d6 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -239,10 +239,6 @@ struct orangefs_write_range {
kgid_t gid;
};
-struct orangefs_read_options {
- ssize_t blksiz;
-};
-
extern struct orangefs_stats orangefs_stats;
/*
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 9fc47c2e078d..9709cf22cab3 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -36,6 +36,13 @@ static int ovl_ccup_get(char *buf, const struct kernel_param *param)
module_param_call(check_copy_up, ovl_ccup_set, ovl_ccup_get, NULL, 0644);
MODULE_PARM_DESC(check_copy_up, "Obsolete; does nothing");
+static bool ovl_must_copy_xattr(const char *name)
+{
+ return !strcmp(name, XATTR_POSIX_ACL_ACCESS) ||
+ !strcmp(name, XATTR_POSIX_ACL_DEFAULT) ||
+ !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
+}
+
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
@@ -107,8 +114,13 @@ retry:
continue; /* Discard */
}
error = vfs_setxattr(new, name, value, size, 0);
- if (error)
- break;
+ if (error) {
+ if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))
+ break;
+
+ /* Ignore failure to copy unknown xattrs */
+ error = 0;
+ }
}
kfree(value);
out:
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 8e57d5372b8f..279009dee366 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -42,7 +42,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
return err;
}
-static struct dentry *ovl_lookup_temp(struct dentry *workdir)
+struct dentry *ovl_lookup_temp(struct dentry *workdir)
{
struct dentry *temp;
char name[20];
@@ -243,6 +243,9 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
ovl_dir_modified(dentry->d_parent, false);
ovl_dentry_set_upper_alias(dentry);
+ ovl_dentry_update_reval(dentry, newdentry,
+ DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+
if (!hardlink) {
/*
* ovl_obtain_alias() can be called after ovl_create_real()
@@ -819,6 +822,28 @@ static bool ovl_pure_upper(struct dentry *dentry)
!ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry));
}
+static void ovl_drop_nlink(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ struct dentry *alias;
+
+ /* Try to find another, hashed alias */
+ spin_lock(&inode->i_lock);
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ if (alias != dentry && !d_unhashed(alias))
+ break;
+ }
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * Changes to underlying layers may cause i_nlink to lose sync with
+ * reality. In this case prevent the link count from going to zero
+ * prematurely.
+ */
+ if (inode->i_nlink > !!alias)
+ drop_nlink(inode);
+}
+
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
int err;
@@ -856,7 +881,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
if (is_dir)
clear_nlink(dentry->d_inode);
else
- drop_nlink(dentry->d_inode);
+ ovl_drop_nlink(dentry);
}
ovl_nlink_end(dentry);
@@ -1201,7 +1226,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
if (new_is_dir)
clear_nlink(d_inode(new));
else
- drop_nlink(d_inode(new));
+ ovl_drop_nlink(new);
}
ovl_dir_modified(old->d_parent, ovl_type_origin(old) ||
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 6f54d70cef27..475c61f53f0f 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -308,29 +308,35 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb,
ovl_set_flag(OVL_UPPERDATA, inode);
dentry = d_find_any_alias(inode);
- if (!dentry) {
- dentry = d_alloc_anon(inode->i_sb);
- if (!dentry)
- goto nomem;
- oe = ovl_alloc_entry(lower ? 1 : 0);
- if (!oe)
- goto nomem;
-
- if (lower) {
- oe->lowerstack->dentry = dget(lower);
- oe->lowerstack->layer = lowerpath->layer;
- }
- dentry->d_fsdata = oe;
- if (upper_alias)
- ovl_dentry_set_upper_alias(dentry);
+ if (dentry)
+ goto out_iput;
+
+ dentry = d_alloc_anon(inode->i_sb);
+ if (unlikely(!dentry))
+ goto nomem;
+ oe = ovl_alloc_entry(lower ? 1 : 0);
+ if (!oe)
+ goto nomem;
+
+ if (lower) {
+ oe->lowerstack->dentry = dget(lower);
+ oe->lowerstack->layer = lowerpath->layer;
}
+ dentry->d_fsdata = oe;
+ if (upper_alias)
+ ovl_dentry_set_upper_alias(dentry);
+
+ ovl_dentry_update_reval(dentry, upper,
+ DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
return d_instantiate_anon(dentry, inode);
nomem:
- iput(inode);
dput(dentry);
- return ERR_PTR(-ENOMEM);
+ dentry = ERR_PTR(-ENOMEM);
+out_iput:
+ iput(inode);
+ return dentry;
}
/* Get the upper or lower dentry in stach whose on layer @idx */
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 79e8994e3bc1..b0d42ece4d7c 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -79,6 +79,7 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
{
bool samefs = ovl_same_fs(dentry->d_sb);
unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
+ unsigned int xinoshift = 64 - xinobits;
if (samefs) {
/*
@@ -89,22 +90,22 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
stat->dev = dentry->d_sb->s_dev;
return 0;
} else if (xinobits) {
- unsigned int shift = 64 - xinobits;
/*
* All inode numbers of underlying fs should not be using the
* high xinobits, so we use high xinobits to partition the
* overlay st_ino address space. The high bits holds the fsid
- * (upper fsid is 0). This way overlay inode numbers are unique
- * and all inodes use overlay st_dev. Inode numbers are also
- * persistent for a given layer configuration.
+ * (upper fsid is 0). The lowest xinobit is reserved for mapping
+ * the non-peresistent inode numbers range in case of overflow.
+ * This way all overlay inode numbers are unique and use the
+ * overlay st_dev.
*/
- if (stat->ino >> shift) {
- pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
- dentry, stat->ino, xinobits);
- } else {
- stat->ino |= ((u64)fsid) << shift;
+ if (likely(!(stat->ino >> xinoshift))) {
+ stat->ino |= ((u64)fsid) << (xinoshift + 1);
stat->dev = dentry->d_sb->s_dev;
return 0;
+ } else if (ovl_xino_warn(dentry->d_sb)) {
+ pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
+ dentry, stat->ino, xinobits);
}
}
@@ -504,7 +505,7 @@ static const struct address_space_operations ovl_aops = {
/*
* It is possible to stack overlayfs instance on top of another
- * overlayfs instance as lower layer. We need to annonate the
+ * overlayfs instance as lower layer. We need to annotate the
* stackable i_mutex locks according to stack level of the super
* block instance. An overlayfs instance can never be in stack
* depth 0 (there is always a real fs below it). An overlayfs
@@ -561,27 +562,73 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
#endif
}
-static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
- unsigned long ino, int fsid)
+static void ovl_next_ino(struct inode *inode)
+{
+ struct ovl_fs *ofs = inode->i_sb->s_fs_info;
+
+ inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
+ if (unlikely(!inode->i_ino))
+ inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
+}
+
+static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid)
{
int xinobits = ovl_xino_bits(inode->i_sb);
+ unsigned int xinoshift = 64 - xinobits;
/*
* When d_ino is consistent with st_ino (samefs or i_ino has enough
* bits to encode layer), set the same value used for st_ino to i_ino,
* so inode number exposed via /proc/locks and a like will be
* consistent with d_ino and st_ino values. An i_ino value inconsistent
- * with d_ino also causes nfsd readdirplus to fail. When called from
- * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real
- * upper inode i_ino on ovl_inode_init() or ovl_inode_update().
+ * with d_ino also causes nfsd readdirplus to fail.
*/
- if (ovl_same_dev(inode->i_sb)) {
- inode->i_ino = ino;
- if (xinobits && fsid && !(ino >> (64 - xinobits)))
- inode->i_ino |= (unsigned long)fsid << (64 - xinobits);
- } else {
- inode->i_ino = get_next_ino();
+ inode->i_ino = ino;
+ if (ovl_same_fs(inode->i_sb)) {
+ return;
+ } else if (xinobits && likely(!(ino >> xinoshift))) {
+ inode->i_ino |= (unsigned long)fsid << (xinoshift + 1);
+ return;
+ }
+
+ /*
+ * For directory inodes on non-samefs with xino disabled or xino
+ * overflow, we allocate a non-persistent inode number, to be used for
+ * resolving st_ino collisions in ovl_map_dev_ino().
+ *
+ * To avoid ino collision with legitimate xino values from upper
+ * layer (fsid 0), use the lowest xinobit to map the non
+ * persistent inode numbers to the unified st_ino address space.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ ovl_next_ino(inode);
+ if (xinobits) {
+ inode->i_ino &= ~0UL >> xinobits;
+ inode->i_ino |= 1UL << xinoshift;
+ }
}
+}
+
+void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
+ unsigned long ino, int fsid)
+{
+ struct inode *realinode;
+
+ if (oip->upperdentry)
+ OVL_I(inode)->__upperdentry = oip->upperdentry;
+ if (oip->lowerpath && oip->lowerpath->dentry)
+ OVL_I(inode)->lower = igrab(d_inode(oip->lowerpath->dentry));
+ if (oip->lowerdata)
+ OVL_I(inode)->lowerdata = igrab(d_inode(oip->lowerdata));
+
+ realinode = ovl_inode_real(inode);
+ ovl_copyattr(realinode, inode);
+ ovl_copyflags(realinode, inode);
+ ovl_map_ino(inode, ino, fsid);
+}
+
+static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
inode->i_mode = mode;
inode->i_flags |= S_NOCMTIME;
#ifdef CONFIG_FS_POSIX_ACL
@@ -719,7 +766,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
inode = new_inode(sb);
if (inode)
- ovl_fill_inode(inode, mode, rdev, 0, 0);
+ ovl_fill_inode(inode, mode, rdev);
return inode;
}
@@ -891,7 +938,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
oip->index);
- int fsid = bylower ? oip->lowerpath->layer->fsid : 0;
+ int fsid = bylower ? lowerpath->layer->fsid : 0;
bool is_dir, metacopy = false;
unsigned long ino = 0;
int err = oip->newinode ? -EEXIST : -ENOMEM;
@@ -941,9 +988,11 @@ struct inode *ovl_get_inode(struct super_block *sb,
err = -ENOMEM;
goto out_err;
}
+ ino = realinode->i_ino;
+ fsid = lowerpath->layer->fsid;
}
- ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid);
- ovl_inode_init(inode, upperdentry, lowerdentry, oip->lowerdata);
+ ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
+ ovl_inode_init(inode, oip, ino, fsid);
if (upperdentry && ovl_is_impuredir(upperdentry))
ovl_set_flag(OVL_IMPURE, inode);
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index ed9e129fae04..0db23baf98e7 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -845,7 +845,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (err)
goto out;
- if (upperdentry && unlikely(ovl_dentry_remote(upperdentry))) {
+ if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) {
dput(upperdentry);
err = -EREMOTE;
goto out;
@@ -1076,6 +1076,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
goto out_free_oe;
}
+ ovl_dentry_update_reval(dentry, upperdentry,
+ DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+
revert_creds(old_cred);
if (origin_path) {
dput(origin_path->dentry);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 3d3f2b8bdae5..e6f3670146ed 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -48,6 +48,12 @@ enum ovl_entry_flag {
OVL_E_CONNECTED,
};
+enum {
+ OVL_XINO_OFF,
+ OVL_XINO_AUTO,
+ OVL_XINO_ON,
+};
+
/*
* The tuple (fh,uuid) is a universal unique identifier for a copy up origin,
* where:
@@ -87,7 +93,7 @@ struct ovl_fb {
u8 flags; /* OVL_FH_FLAG_* */
u8 type; /* fid_type of fid */
uuid_t uuid; /* uuid of filesystem */
- u32 fid[0]; /* file identifier should be 32bit aligned in-memory */
+ u32 fid[]; /* file identifier should be 32bit aligned in-memory */
} __packed;
/* In-memory and on-wire format for overlay file handle */
@@ -230,6 +236,8 @@ bool ovl_index_all(struct super_block *sb);
bool ovl_verify_lower(struct super_block *sb);
struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
bool ovl_dentry_remote(struct dentry *dentry);
+void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry,
+ unsigned int mask);
bool ovl_dentry_weird(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
@@ -264,8 +272,6 @@ void ovl_set_upperdata(struct inode *inode);
bool ovl_redirect_dir(struct super_block *sb);
const char *ovl_dentry_get_redirect(struct dentry *dentry);
void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
-void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
- struct dentry *lowerdentry, struct dentry *lowerdata);
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
void ovl_dir_modified(struct dentry *dentry, bool impurity);
u64 ovl_dentry_version_get(struct dentry *dentry);
@@ -301,6 +307,16 @@ static inline bool ovl_is_impuredir(struct dentry *dentry)
return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE);
}
+/*
+ * With xino=auto, we do best effort to keep all inodes on same st_dev and
+ * d_ino consistent with st_ino.
+ * With xino=on, we do the same effort but we warn if we failed.
+ */
+static inline bool ovl_xino_warn(struct super_block *sb)
+{
+ return OVL_FS(sb)->config.xino == OVL_XINO_ON;
+}
+
/* All layers on same fs? */
static inline bool ovl_same_fs(struct super_block *sb)
{
@@ -410,6 +426,8 @@ struct ovl_inode_params {
char *redirect;
struct dentry *lowerdata;
};
+void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
+ unsigned long ino, int fsid);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
bool is_upper);
@@ -451,6 +469,7 @@ struct ovl_cattr {
struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct ovl_cattr *attr);
int ovl_cleanup(struct inode *dir, struct dentry *dentry);
+struct dentry *ovl_lookup_temp(struct dentry *workdir);
struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr);
/* file.c */
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 89015ea822e7..5762d802fe01 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -75,6 +75,8 @@ struct ovl_fs {
struct inode *indexdir_trap;
/* -1: disabled, 0: same fs, 1..32: number of unused ino bits */
int xino_mode;
+ /* For allocation of non-persistent inode numbers */
+ atomic_long_t last_ino;
};
static inline struct ovl_fs *OVL_FS(struct super_block *sb)
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 40ac9ce2465a..e452ff7d583d 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -438,15 +438,23 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
/* Map inode number to lower fs unique range */
static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
- const char *name, int namelen)
+ const char *name, int namelen, bool warn)
{
- if (ino >> (64 - xinobits)) {
- pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
- namelen, name, ino, xinobits);
+ unsigned int xinoshift = 64 - xinobits;
+
+ if (unlikely(ino >> xinoshift)) {
+ if (warn) {
+ pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
+ namelen, name, ino, xinobits);
+ }
return ino;
}
- return ino | ((u64)fsid) << (64 - xinobits);
+ /*
+ * The lowest xinobit is reserved for mapping the non-peresistent inode
+ * numbers range, but this range is only exposed via st_ino, not here.
+ */
+ return ino | ((u64)fsid) << (xinoshift + 1);
}
/*
@@ -515,7 +523,8 @@ get:
} else if (xinobits && !OVL_TYPE_UPPER(type)) {
ino = ovl_remap_lower_ino(ino, xinobits,
ovl_layer_lower(this)->fsid,
- p->name, p->len);
+ p->name, p->len,
+ ovl_xino_warn(dir->d_sb));
}
out:
@@ -645,6 +654,7 @@ struct ovl_readdir_translate {
u64 parent_ino;
int fsid;
int xinobits;
+ bool xinowarn;
};
static int ovl_fill_real(struct dir_context *ctx, const char *name,
@@ -665,7 +675,7 @@ static int ovl_fill_real(struct dir_context *ctx, const char *name,
ino = p->ino;
} else if (rdt->xinobits) {
ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid,
- name, namelen);
+ name, namelen, rdt->xinowarn);
}
return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
@@ -696,6 +706,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
.ctx.actor = ovl_fill_real,
.orig_ctx = ctx,
.xinobits = ovl_xino_bits(dir->d_sb),
+ .xinowarn = ovl_xino_warn(dir->d_sb),
};
if (rdt.xinobits && lower_layer)
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index ac967f1cb6e5..732ad5495c92 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -113,53 +113,54 @@ bug:
return dentry;
}
-static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
{
- struct ovl_entry *oe = dentry->d_fsdata;
- unsigned int i;
int ret = 1;
- for (i = 0; i < oe->numlower; i++) {
- struct dentry *d = oe->lowerstack[i].dentry;
-
- if (d->d_flags & DCACHE_OP_REVALIDATE) {
- ret = d->d_op->d_revalidate(d, flags);
- if (ret < 0)
- return ret;
- if (!ret) {
- if (!(flags & LOOKUP_RCU))
- d_invalidate(d);
- return -ESTALE;
- }
+ if (weak) {
+ if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE)
+ ret = d->d_op->d_weak_revalidate(d, flags);
+ } else if (d->d_flags & DCACHE_OP_REVALIDATE) {
+ ret = d->d_op->d_revalidate(d, flags);
+ if (!ret) {
+ if (!(flags & LOOKUP_RCU))
+ d_invalidate(d);
+ ret = -ESTALE;
}
}
- return 1;
+ return ret;
}
-static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
+static int ovl_dentry_revalidate_common(struct dentry *dentry,
+ unsigned int flags, bool weak)
{
struct ovl_entry *oe = dentry->d_fsdata;
+ struct dentry *upper;
unsigned int i;
int ret = 1;
- for (i = 0; i < oe->numlower; i++) {
- struct dentry *d = oe->lowerstack[i].dentry;
+ upper = ovl_dentry_upper(dentry);
+ if (upper)
+ ret = ovl_revalidate_real(upper, flags, weak);
- if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) {
- ret = d->d_op->d_weak_revalidate(d, flags);
- if (ret <= 0)
- break;
- }
+ for (i = 0; ret > 0 && i < oe->numlower; i++) {
+ ret = ovl_revalidate_real(oe->lowerstack[i].dentry, flags,
+ weak);
}
return ret;
}
-static const struct dentry_operations ovl_dentry_operations = {
- .d_release = ovl_dentry_release,
- .d_real = ovl_d_real,
-};
+static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return ovl_dentry_revalidate_common(dentry, flags, false);
+}
+
+static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return ovl_dentry_revalidate_common(dentry, flags, true);
+}
-static const struct dentry_operations ovl_reval_dentry_operations = {
+static const struct dentry_operations ovl_dentry_operations = {
.d_release = ovl_dentry_release,
.d_real = ovl_d_real,
.d_revalidate = ovl_dentry_revalidate,
@@ -316,12 +317,6 @@ static const char *ovl_redirect_mode_def(void)
return ovl_redirect_dir_def ? "on" : "off";
}
-enum {
- OVL_XINO_OFF,
- OVL_XINO_AUTO,
- OVL_XINO_ON,
-};
-
static const char * const ovl_xino_str[] = {
"off",
"auto",
@@ -751,13 +746,12 @@ static int ovl_mount_dir(const char *name, struct path *path)
ovl_unescape(tmp);
err = ovl_mount_dir_noesc(tmp, path);
- if (!err)
- if (ovl_dentry_remote(path->dentry)) {
- pr_err("filesystem on '%s' not supported as upperdir\n",
- tmp);
- path_put_init(path);
- err = -EINVAL;
- }
+ if (!err && path->dentry->d_flags & DCACHE_OP_REAL) {
+ pr_err("filesystem on '%s' not supported as upperdir\n",
+ tmp);
+ path_put_init(path);
+ err = -EINVAL;
+ }
kfree(tmp);
}
return err;
@@ -778,7 +772,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs,
}
static int ovl_lower_dir(const char *name, struct path *path,
- struct ovl_fs *ofs, int *stack_depth, bool *remote)
+ struct ovl_fs *ofs, int *stack_depth)
{
int fh_type;
int err;
@@ -793,9 +787,6 @@ static int ovl_lower_dir(const char *name, struct path *path,
*stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
- if (ovl_dentry_remote(path->dentry))
- *remote = true;
-
/*
* The inodes index feature and NFS export need to encode and decode
* file handles, so they require that all layers support them.
@@ -1074,11 +1065,73 @@ out:
return err;
}
+/*
+ * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and
+ * negative values if error is encountered.
+ */
+static int ovl_check_rename_whiteout(struct dentry *workdir)
+{
+ struct inode *dir = d_inode(workdir);
+ struct dentry *temp;
+ struct dentry *dest;
+ struct dentry *whiteout;
+ struct name_snapshot name;
+ int err;
+
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+
+ temp = ovl_create_temp(workdir, OVL_CATTR(S_IFREG | 0));
+ err = PTR_ERR(temp);
+ if (IS_ERR(temp))
+ goto out_unlock;
+
+ dest = ovl_lookup_temp(workdir);
+ err = PTR_ERR(dest);
+ if (IS_ERR(dest)) {
+ dput(temp);
+ goto out_unlock;
+ }
+
+ /* Name is inline and stable - using snapshot as a copy helper */
+ take_dentry_name_snapshot(&name, temp);
+ err = ovl_do_rename(dir, temp, dir, dest, RENAME_WHITEOUT);
+ if (err) {
+ if (err == -EINVAL)
+ err = 0;
+ goto cleanup_temp;
+ }
+
+ whiteout = lookup_one_len(name.name.name, workdir, name.name.len);
+ err = PTR_ERR(whiteout);
+ if (IS_ERR(whiteout))
+ goto cleanup_temp;
+
+ err = ovl_is_whiteout(whiteout);
+
+ /* Best effort cleanup of whiteout and temp file */
+ if (err)
+ ovl_cleanup(dir, whiteout);
+ dput(whiteout);
+
+cleanup_temp:
+ ovl_cleanup(dir, temp);
+ release_dentry_name_snapshot(&name);
+ dput(temp);
+ dput(dest);
+
+out_unlock:
+ inode_unlock(dir);
+
+ return err;
+}
+
static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
struct path *workpath)
{
struct vfsmount *mnt = ofs->upper_mnt;
struct dentry *temp;
+ bool rename_whiteout;
+ bool d_type;
int fh_type;
int err;
@@ -1104,11 +1157,8 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
if (err < 0)
goto out;
- /*
- * We allowed this configuration and don't want to break users over
- * kernel upgrade. So warn instead of erroring out.
- */
- if (!err)
+ d_type = err;
+ if (!d_type)
pr_warn("upper fs needs to support d_type.\n");
/* Check if upper/work fs supports O_TMPFILE */
@@ -1119,6 +1169,16 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
else
pr_warn("upper fs does not support tmpfile.\n");
+
+ /* Check if upper/work fs supports RENAME_WHITEOUT */
+ err = ovl_check_rename_whiteout(ofs->workdir);
+ if (err < 0)
+ goto out;
+
+ rename_whiteout = err;
+ if (!rename_whiteout)
+ pr_warn("upper fs does not support RENAME_WHITEOUT.\n");
+
/*
* Check if upper/work fs supports trusted.overlay.* xattr
*/
@@ -1133,6 +1193,18 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE);
}
+ /*
+ * We allowed sub-optimal upper fs configuration and don't want to break
+ * users over kernel upgrade, but we never allowed remote upper fs, so
+ * we can enforce strict requirements for remote upper fs.
+ */
+ if (ovl_dentry_remote(ofs->workdir) &&
+ (!d_type || !rename_whiteout || ofs->noxattr)) {
+ pr_err("upper fs missing required features.\n");
+ err = -EINVAL;
+ goto out;
+ }
+
/* Check if upper/work fs supports file handles */
fh_type = ovl_can_decode_fh(ofs->workdir->d_sb);
if (ofs->config.index && !fh_type) {
@@ -1401,11 +1473,12 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
/*
* When all layers on same fs, overlay can use real inode numbers.
- * With mount option "xino=on", mounter declares that there are enough
- * free high bits in underlying fs to hold the unique fsid.
+ * With mount option "xino=<on|auto>", mounter declares that there are
+ * enough free high bits in underlying fs to hold the unique fsid.
* If overlayfs does encounter underlying inodes using the high xino
* bits reserved for fsid, it emits a warning and uses the original
- * inode number.
+ * inode number or a non persistent inode number allocated from a
+ * dedicated range.
*/
if (ofs->numfs - !ofs->upper_mnt == 1) {
if (ofs->config.xino == OVL_XINO_ON)
@@ -1413,14 +1486,16 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
ofs->xino_mode = 0;
} else if (ofs->config.xino == OVL_XINO_OFF) {
ofs->xino_mode = -1;
- } else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) {
+ } else if (ofs->xino_mode < 0) {
/*
* This is a roundup of number of bits needed for encoding
- * fsid, where fsid 0 is reserved for upper fs even with
- * lower only overlay.
+ * fsid, where fsid 0 is reserved for upper fs (even with
+ * lower only overlay) +1 extra bit is reserved for the non
+ * persistent inode number range that is used for resolving
+ * xino lower bits overflow.
*/
- BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31);
- ofs->xino_mode = ilog2(ofs->numfs - 1) + 1;
+ BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 30);
+ ofs->xino_mode = ilog2(ofs->numfs - 1) + 2;
}
if (ofs->xino_mode > 0) {
@@ -1440,7 +1515,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
char *lowertmp, *lower;
struct path *stack = NULL;
unsigned int stacklen, numlower = 0, i;
- bool remote = false;
struct ovl_entry *oe;
err = -ENOMEM;
@@ -1472,7 +1546,7 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
lower = lowertmp;
for (numlower = 0; numlower < stacklen; numlower++) {
err = ovl_lower_dir(lower, &stack[numlower], ofs,
- &sb->s_stack_depth, &remote);
+ &sb->s_stack_depth);
if (err)
goto out_err;
@@ -1500,11 +1574,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
oe->lowerstack[i].layer = &ofs->layers[i+1];
}
- if (remote)
- sb->s_d_op = &ovl_reval_dentry_operations;
- else
- sb->s_d_op = &ovl_dentry_operations;
-
out:
for (i = 0; i < numlower; i++)
path_put(&stack[i]);
@@ -1589,6 +1658,44 @@ static int ovl_check_overlapping_layers(struct super_block *sb,
return 0;
}
+static struct dentry *ovl_get_root(struct super_block *sb,
+ struct dentry *upperdentry,
+ struct ovl_entry *oe)
+{
+ struct dentry *root;
+ struct ovl_path *lowerpath = &oe->lowerstack[0];
+ unsigned long ino = d_inode(lowerpath->dentry)->i_ino;
+ int fsid = lowerpath->layer->fsid;
+ struct ovl_inode_params oip = {
+ .upperdentry = upperdentry,
+ .lowerpath = lowerpath,
+ };
+
+ root = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
+ if (!root)
+ return NULL;
+
+ root->d_fsdata = oe;
+
+ if (upperdentry) {
+ /* Root inode uses upper st_ino/i_ino */
+ ino = d_inode(upperdentry)->i_ino;
+ fsid = 0;
+ ovl_dentry_set_upper_alias(root);
+ if (ovl_is_impuredir(upperdentry))
+ ovl_set_flag(OVL_IMPURE, d_inode(root));
+ }
+
+ /* Root is always merge -> can have whiteouts */
+ ovl_set_flag(OVL_WHITEOUTS, d_inode(root));
+ ovl_dentry_set_flag(OVL_E_CONNECTED, root);
+ ovl_set_upperdata(d_inode(root));
+ ovl_inode_init(d_inode(root), &oip, ino, fsid);
+ ovl_dentry_update_reval(root, upperdentry, DCACHE_OP_WEAK_REVALIDATE);
+
+ return root;
+}
+
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
struct path upperpath = { };
@@ -1598,6 +1705,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
struct cred *cred;
int err;
+ sb->s_d_op = &ovl_dentry_operations;
+
err = -ENOMEM;
ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
if (!ofs)
@@ -1624,6 +1733,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_stack_depth = 0;
sb->s_maxbytes = MAX_LFS_FILESIZE;
+ atomic_long_set(&ofs->last_ino, 1);
/* Assume underlaying fs uses 32bit inodes unless proven otherwise */
if (ofs->config.xino != OVL_XINO_OFF) {
ofs->xino_mode = BITS_PER_LONG - 32;
@@ -1710,25 +1820,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags |= SB_POSIXACL;
err = -ENOMEM;
- root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
+ root_dentry = ovl_get_root(sb, upperpath.dentry, oe);
if (!root_dentry)
goto out_free_oe;
- root_dentry->d_fsdata = oe;
-
mntput(upperpath.mnt);
- if (upperpath.dentry) {
- ovl_dentry_set_upper_alias(root_dentry);
- if (ovl_is_impuredir(upperpath.dentry))
- ovl_set_flag(OVL_IMPURE, d_inode(root_dentry));
- }
-
- /* Root is always merge -> can have whiteouts */
- ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry));
- ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry);
- ovl_set_upperdata(d_inode(root_dentry));
- ovl_inode_init(d_inode(root_dentry), upperpath.dentry,
- ovl_dentry_lower(root_dentry), NULL);
sb->s_root = root_dentry;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 042f7eb4f7f4..36b60788ee47 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -93,8 +93,24 @@ struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
bool ovl_dentry_remote(struct dentry *dentry)
{
return dentry->d_flags &
- (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE |
- DCACHE_OP_REAL);
+ (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+}
+
+void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry,
+ unsigned int mask)
+{
+ struct ovl_entry *oe = OVL_E(dentry);
+ unsigned int i, flags = 0;
+
+ if (upperdentry)
+ flags |= upperdentry->d_flags;
+ for (i = 0; i < oe->numlower; i++)
+ flags |= oe->lowerstack[i].dentry->d_flags;
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~mask;
+ dentry->d_flags |= flags & mask;
+ spin_unlock(&dentry->d_lock);
}
bool ovl_dentry_weird(struct dentry *dentry)
@@ -386,24 +402,6 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect)
oi->redirect = redirect;
}
-void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
- struct dentry *lowerdentry, struct dentry *lowerdata)
-{
- struct inode *realinode = d_inode(upperdentry ?: lowerdentry);
-
- if (upperdentry)
- OVL_I(inode)->__upperdentry = upperdentry;
- if (lowerdentry)
- OVL_I(inode)->lower = igrab(d_inode(lowerdentry));
- if (lowerdata)
- OVL_I(inode)->lowerdata = igrab(d_inode(lowerdata));
-
- ovl_copyattr(realinode, inode);
- ovl_copyflags(realinode, inode);
- if (!inode->i_ino)
- inode->i_ino = realinode->i_ino;
-}
-
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
{
struct inode *upperinode = d_inode(upperdentry);
@@ -416,8 +414,6 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
smp_wmb();
OVL_I(inode)->__upperdentry = upperdentry;
if (inode_unhashed(inode)) {
- if (!inode->i_ino)
- inode->i_ino = upperinode->i_ino;
inode->i_private = upperinode;
__insert_inode_hash(inode, (unsigned long) upperinode);
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 74f948a6b621..6042b646ab27 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1839,9 +1839,9 @@ void proc_pid_evict_inode(struct proc_inode *ei)
struct pid *pid = ei->pid;
if (S_ISDIR(ei->vfs_inode.i_mode)) {
- spin_lock(&pid->wait_pidfd.lock);
+ spin_lock(&pid->lock);
hlist_del_init_rcu(&ei->sibling_inodes);
- spin_unlock(&pid->wait_pidfd.lock);
+ spin_unlock(&pid->lock);
}
put_pid(pid);
@@ -1877,9 +1877,9 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
/* Let the pid remember us for quick removal */
ei->pid = pid;
if (S_ISDIR(mode)) {
- spin_lock(&pid->wait_pidfd.lock);
+ spin_lock(&pid->lock);
hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
- spin_unlock(&pid->wait_pidfd.lock);
+ spin_unlock(&pid->lock);
}
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
@@ -3273,7 +3273,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
void proc_flush_pid(struct pid *pid)
{
- proc_invalidate_siblings_dcache(&pid->inodes, &pid->wait_pidfd.lock);
+ proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
put_pid(pid);
}
diff --git a/fs/read_write.c b/fs/read_write.c
index 59d819c5b92e..bbfa9b12b15e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -331,7 +331,8 @@ COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned i
}
#endif
-#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT)
+#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
+ defined(__ARCH_WANT_SYS_LLSEEK)
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
unsigned long, offset_low, loff_t __user *, result,
unsigned int, whence)
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 79781ebd2145..70f5fdf99bf6 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -232,9 +232,12 @@ Fill:
loff_t pos = m->index;
p = m->op->next(m, p, &m->index);
- if (pos == m->index)
- /* Buggy ->next function */
+ if (pos == m->index) {
+ pr_info_ratelimited("buggy seq_file .next function %ps "
+ "did not updated position index\n",
+ m->op->next);
m->index++;
+ }
if (!p || IS_ERR(p)) {
err = PTR_ERR(p);
break;
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index 3fd85464abd5..736ebc5dc441 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -5,7 +5,7 @@
* http://www.ecma.ch
*
* Copyright (c) 2001-2002 Ben Fennema
- * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (c) 2017-2019 Pali Rohár <pali@kernel.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 35e61b2cacfe..d5fbfab3ddb6 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -5,7 +5,7 @@
* http://www.osta.org
*
* Copyright (c) 2001-2004 Ben Fennema
- * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (c) 2017-2019 Pali Rohár <pali@kernel.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 00266de58954..c526c5e5ab76 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -328,6 +328,38 @@ xfs_validate_sb_common(
return -EFSCORRUPTED;
}
+ /* Validate the realtime geometry; stolen from xfs_repair */
+ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+ sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) {
+ xfs_notice(mp,
+ "realtime extent sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
+ if (sbp->sb_rblocks == 0) {
+ if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
+ sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) {
+ xfs_notice(mp,
+ "realtime zeroed geometry check failed");
+ return -EFSCORRUPTED;
+ }
+ } else {
+ uint64_t rexts;
+ uint64_t rbmblocks;
+
+ rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
+ rbmblocks = howmany_64(sbp->sb_rextents,
+ NBBY * sbp->sb_blocksize);
+
+ if (sbp->sb_rextents != rexts ||
+ sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
+ sbp->sb_rbmblocks != rbmblocks) {
+ xfs_notice(mp,
+ "realtime geometry sanity check failed");
+ return -EFSCORRUPTED;
+ }
+ }
+
if (sbp->sb_unit) {
if (!xfs_sb_version_hasdalign(sbp) ||
sbp->sb_unit > sbp->sb_width ||
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f880141a2268..9ec3eaf1c618 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -327,6 +327,9 @@ xfs_buf_free(
__free_page(page);
}
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab +=
+ bp->b_page_count;
} else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
_xfs_buf_free_pages(bp);
@@ -2114,9 +2117,11 @@ xfs_buf_delwri_pushbuf(
int __init
xfs_buf_init(void)
{
- xfs_buf_zone = kmem_cache_create("xfs_buf",
- sizeof(struct xfs_buf), 0,
- SLAB_HWCACHE_ALIGN, NULL);
+ xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD,
+ NULL);
if (!xfs_buf_zone)
goto out;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 711376ca269f..af2c8e5ceea0 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1105,8 +1105,8 @@ xfs_qm_dqflush(
* Get the buffer containing the on-disk dquot
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0, &bp,
- &xfs_dquot_buf_ops);
+ mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK,
+ &bp, &xfs_dquot_buf_ops);
if (error)
goto out_unlock;
@@ -1177,7 +1177,7 @@ xfs_qm_dqflush(
out_unlock:
xfs_dqfunlock(dqp);
- return -EIO;
+ return error;
}
/*
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index cf65e2e43c6e..baad1748d0d1 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -189,7 +189,8 @@ xfs_qm_dquot_logitem_push(
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
- }
+ } else if (error == -EAGAIN)
+ rval = XFS_ITEM_LOCKED;
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index f1372f9046e3..5a4b0119143a 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -15,7 +15,6 @@
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_icache.h"
-#include "xfs_log.h"
#include "xfs_pnfs.h"
/*
@@ -221,18 +220,7 @@ STATIC int
xfs_fs_nfs_commit_metadata(
struct inode *inode)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (!lsn)
- return 0;
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_inode(XFS_I(inode));
}
const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b8a4a3f29b36..4b8bdecc3863 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -80,19 +80,9 @@ xfs_dir_fsync(
int datasync)
{
struct xfs_inode *ip = XFS_I(file->f_mapping->host);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
trace_xfs_dir_fsync(ip);
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (!lsn)
- return 0;
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_inode(ip);
}
STATIC int
@@ -1069,7 +1059,11 @@ xfs_file_remap_range(
ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
remap_flags);
+ if (ret)
+ goto out_unlock;
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_log_force_inode(dest);
out_unlock:
xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 14b922f2a6db..d1772786af29 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1200,8 +1200,7 @@ xfs_create(
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
- resblks ?
- resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+ resblks - XFS_IALLOC_SPACE_RES(mp));
if (error) {
ASSERT(error != -ENOSPC);
goto out_trans_cancel;
@@ -2504,6 +2503,88 @@ out:
}
/*
+ * Look up the inode number specified and mark it stale if it is found. If it is
+ * dirty, return the inode so it can be attached to the cluster buffer so it can
+ * be processed appropriately when the cluster free transaction completes.
+ */
+static struct xfs_inode *
+xfs_ifree_get_one_inode(
+ struct xfs_perag *pag,
+ struct xfs_inode *free_ip,
+ xfs_ino_t inum)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_inode *ip;
+
+retry:
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
+
+ /* Inode not in memory, nothing to do */
+ if (!ip)
+ goto out_rcu_unlock;
+
+ /*
+ * because this is an RCU protected lookup, we could find a recently
+ * freed or even reallocated inode during the lookup. We need to check
+ * under the i_flags_lock for a valid inode here. Skip it if it is not
+ * valid, the wrong inode or stale.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
+ spin_unlock(&ip->i_flags_lock);
+ goto out_rcu_unlock;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Don't try to lock/unlock the current inode, but we _cannot_ skip the
+ * other inodes that we did not find in the list attached to the buffer
+ * and are not already marked stale. If we can't lock it, back off and
+ * retry.
+ */
+ if (ip != free_ip) {
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ rcu_read_unlock();
+ delay(1);
+ goto retry;
+ }
+
+ /*
+ * Check the inode number again in case we're racing with
+ * freeing in xfs_reclaim_inode(). See the comments in that
+ * function for more information as to why the initial check is
+ * not sufficient.
+ */
+ if (ip->i_ino != inum) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out_rcu_unlock;
+ }
+ }
+ rcu_read_unlock();
+
+ xfs_iflock(ip);
+ xfs_iflags_set(ip, XFS_ISTALE);
+
+ /*
+ * We don't need to attach clean inodes or those only with unlogged
+ * changes (which we throw away, anyway).
+ */
+ if (!ip->i_itemp || xfs_inode_clean(ip)) {
+ ASSERT(ip != free_ip);
+ xfs_ifunlock(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out_no_inode;
+ }
+ return ip;
+
+out_rcu_unlock:
+ rcu_read_unlock();
+out_no_inode:
+ return NULL;
+}
+
+/*
* A big issue when freeing the inode cluster is that we _cannot_ skip any
* inodes that are in memory - they all must be marked stale and attached to
* the cluster buffer.
@@ -2603,77 +2684,11 @@ xfs_ifree_cluster(
* even trying to lock them.
*/
for (i = 0; i < igeo->inodes_per_cluster; i++) {
-retry:
- rcu_read_lock();
- ip = radix_tree_lookup(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, (inum + i)));
-
- /* Inode not in memory, nothing to do */
- if (!ip) {
- rcu_read_unlock();
+ ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
+ if (!ip)
continue;
- }
-
- /*
- * because this is an RCU protected lookup, we could
- * find a recently freed or even reallocated inode
- * during the lookup. We need to check under the
- * i_flags_lock for a valid inode here. Skip it if it
- * is not valid, the wrong inode or stale.
- */
- spin_lock(&ip->i_flags_lock);
- if (ip->i_ino != inum + i ||
- __xfs_iflags_test(ip, XFS_ISTALE)) {
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
- continue;
- }
- spin_unlock(&ip->i_flags_lock);
-
- /*
- * Don't try to lock/unlock the current inode, but we
- * _cannot_ skip the other inodes that we did not find
- * in the list attached to the buffer and are not
- * already marked stale. If we can't lock it, back off
- * and retry.
- */
- if (ip != free_ip) {
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- rcu_read_unlock();
- delay(1);
- goto retry;
- }
-
- /*
- * Check the inode number again in case we're
- * racing with freeing in xfs_reclaim_inode().
- * See the comments in that function for more
- * information as to why the initial check is
- * not sufficient.
- */
- if (ip->i_ino != inum + i) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- rcu_read_unlock();
- continue;
- }
- }
- rcu_read_unlock();
-
- xfs_iflock(ip);
- xfs_iflags_set(ip, XFS_ISTALE);
- /*
- * we don't need to attach clean inodes or those only
- * with unlogged changes (which we throw away, anyway).
- */
iip = ip->i_itemp;
- if (!iip || xfs_inode_clean(ip)) {
- ASSERT(ip != free_ip);
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- continue;
- }
-
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
@@ -3930,3 +3945,22 @@ xfs_irele(
trace_xfs_irele(ip, _RET_IP_);
iput(VFS_I(ip));
}
+
+/*
+ * Ensure all commited transactions touching the inode are written to the log.
+ */
+int
+xfs_log_force_inode(
+ struct xfs_inode *ip)
+{
+ xfs_lsn_t lsn = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ lsn = ip->i_itemp->ili_last_lsn;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!lsn)
+ return 0;
+ return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 492e53992fa9..c6a63f6764a6 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -426,6 +426,7 @@ int xfs_itruncate_extents_flags(struct xfs_trans **,
struct xfs_inode *, int, xfs_fsize_t, int);
void xfs_iext_realloc(xfs_inode_t *, int, int);
+int xfs_log_force_inode(struct xfs_inode *ip);
void xfs_iunpin_wait(xfs_inode_t *);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 4a3d13d4a022..f779cca2346f 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -552,7 +552,8 @@ xfs_inode_item_push(
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
- }
+ } else if (error == -EAGAIN)
+ rval = XFS_ITEM_LOCKED;
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
@@ -730,29 +731,27 @@ xfs_iflush_done(
* holding the lock before removing the inode from the AIL.
*/
if (need_ail) {
- bool mlip_changed = false;
+ xfs_lsn_t tail_lsn = 0;
/* this is an opencoded batch version of xfs_trans_ail_delete */
spin_lock(&ailp->ail_lock);
list_for_each_entry(blip, &tmp, li_bio_list) {
if (INODE_ITEM(blip)->ili_logged &&
- blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
- mlip_changed |= xfs_ail_delete_one(ailp, blip);
- else {
+ blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) {
+ /*
+ * xfs_ail_update_finish() only cares about the
+ * lsn of the first tail item removed, any
+ * others will be at the same or higher lsn so
+ * we just ignore them.
+ */
+ xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip);
+ if (!tail_lsn && lsn)
+ tail_lsn = lsn;
+ } else {
xfs_clear_li_failed(blip);
}
}
-
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
- xlog_assign_tail_lsn_locked(ailp->ail_mount);
- if (list_empty(&ailp->ail_head))
- wake_up_all(&ailp->ail_empty);
- }
- spin_unlock(&ailp->ail_lock);
-
- if (mlip_changed)
- xfs_log_space_wake(ailp->ail_mount);
+ xfs_ail_update_finish(ailp, tail_lsn);
}
/*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4a53768c5397..00fda2e8e738 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -24,13 +24,6 @@
kmem_zone_t *xfs_log_ticket_zone;
/* Local miscellaneous function prototypes */
-STATIC int
-xlog_commit_record(
- struct xlog *log,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- xfs_lsn_t *commitlsnp);
-
STATIC struct xlog *
xlog_alloc_log(
struct xfs_mount *mp,
@@ -66,14 +59,6 @@ xlog_grant_push_ail(
struct xlog *log,
int need_bytes);
STATIC void
-xlog_regrant_reserve_log_space(
- struct xlog *log,
- struct xlog_ticket *ticket);
-STATIC void
-xlog_ungrant_log_space(
- struct xlog *log,
- struct xlog_ticket *ticket);
-STATIC void
xlog_sync(
struct xlog *log,
struct xlog_in_core *iclog);
@@ -478,73 +463,6 @@ out_error:
return error;
}
-
-/*
- * NOTES:
- *
- * 1. currblock field gets updated at startup and after in-core logs
- * marked as with WANT_SYNC.
- */
-
-/*
- * This routine is called when a user of a log manager ticket is done with
- * the reservation. If the ticket was ever used, then a commit record for
- * the associated transaction is written out as a log operation header with
- * no data. The flag XLOG_TIC_INITED is set when the first write occurs with
- * a given ticket. If the ticket was one with a permanent reservation, then
- * a few operations are done differently. Permanent reservation tickets by
- * default don't release the reservation. They just commit the current
- * transaction with the belief that the reservation is still needed. A flag
- * must be passed in before permanent reservations are actually released.
- * When these type of tickets are not released, they need to be set into
- * the inited state again. By doing this, a start record will be written
- * out when the next write occurs.
- */
-xfs_lsn_t
-xfs_log_done(
- struct xfs_mount *mp,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- bool regrant)
-{
- struct xlog *log = mp->m_log;
- xfs_lsn_t lsn = 0;
-
- if (XLOG_FORCED_SHUTDOWN(log) ||
- /*
- * If nothing was ever written, don't write out commit record.
- * If we get an error, just continue and give back the log ticket.
- */
- (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
- (xlog_commit_record(log, ticket, iclog, &lsn)))) {
- lsn = (xfs_lsn_t) -1;
- regrant = false;
- }
-
-
- if (!regrant) {
- trace_xfs_log_done_nonperm(log, ticket);
-
- /*
- * Release ticket if not permanent reservation or a specific
- * request has been made to release a permanent reservation.
- */
- xlog_ungrant_log_space(log, ticket);
- } else {
- trace_xfs_log_done_perm(log, ticket);
-
- xlog_regrant_reserve_log_space(log, ticket);
- /* If this ticket was a permanent reservation and we aren't
- * trying to release it, reset the inited flags; so next time
- * we write, a start record will be written out.
- */
- ticket->t_flags |= XLOG_TIC_INITED;
- }
-
- xfs_log_ticket_put(ticket);
- return lsn;
-}
-
static bool
__xlog_state_release_iclog(
struct xlog *log,
@@ -869,32 +787,44 @@ xlog_wait_on_iclog(
}
/*
- * Final log writes as part of unmount.
- *
- * Mark the filesystem clean as unmount happens. Note that during relocation
- * this routine needs to be executed as part of source-bag while the
- * deallocation must not be done until source-end.
+ * Write out an unmount record using the ticket provided. We have to account for
+ * the data space used in the unmount ticket as this write is not done from a
+ * transaction context that has already done the accounting for us.
*/
-
-/* Actually write the unmount record to disk. */
-static void
-xfs_log_write_unmount_record(
- struct xfs_mount *mp)
+static int
+xlog_write_unmount_record(
+ struct xlog *log,
+ struct xlog_ticket *ticket,
+ xfs_lsn_t *lsn,
+ uint flags)
{
- /* the data section must be 32 bit size aligned */
- struct xfs_unmount_log_format magic = {
+ struct xfs_unmount_log_format ulf = {
.magic = XLOG_UNMOUNT_TYPE,
};
struct xfs_log_iovec reg = {
- .i_addr = &magic,
- .i_len = sizeof(magic),
+ .i_addr = &ulf,
+ .i_len = sizeof(ulf),
.i_type = XLOG_REG_TYPE_UNMOUNT,
};
struct xfs_log_vec vec = {
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
- struct xlog *log = mp->m_log;
+
+ /* account for space used by record data */
+ ticket->t_curr_res -= sizeof(ulf);
+ return xlog_write(log, &vec, ticket, lsn, NULL, flags, false);
+}
+
+/*
+ * Mark the filesystem clean by writing an unmount record to the head of the
+ * log.
+ */
+static void
+xlog_unmount_write(
+ struct xlog *log)
+{
+ struct xfs_mount *mp = log->l_mp;
struct xlog_in_core *iclog;
struct xlog_ticket *tic = NULL;
xfs_lsn_t lsn;
@@ -905,23 +835,7 @@ xfs_log_write_unmount_record(
if (error)
goto out_err;
- /*
- * If we think the summary counters are bad, clear the unmount header
- * flag in the unmount record so that the summary counters will be
- * recalculated during log recovery at next mount. Refer to
- * xlog_check_unmount_rec for more details.
- */
- if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
- XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
- xfs_alert(mp, "%s: will fix summary counters at next mount",
- __func__);
- flags &= ~XLOG_UNMOUNT_TRANS;
- }
-
- /* remove inited flag, and account for space used */
- tic->t_flags = 0;
- tic->t_curr_res -= sizeof(magic);
- error = xlog_write(log, &vec, tic, &lsn, NULL, flags);
+ error = xlog_write_unmount_record(log, tic, &lsn, flags);
/*
* At this point, we're umounting anyway, so there's no point in
* transitioning log state to IOERROR. Just continue...
@@ -943,8 +857,7 @@ out_err:
if (tic) {
trace_xfs_log_umount_write(log, tic);
- xlog_ungrant_log_space(log, tic);
- xfs_log_ticket_put(tic);
+ xfs_log_ticket_ungrant(log, tic);
}
}
@@ -987,8 +900,22 @@ xfs_log_unmount_write(
if (XLOG_FORCED_SHUTDOWN(log))
return;
+
+ /*
+ * If we think the summary counters are bad, avoid writing the unmount
+ * record to force log recovery at next mount, after which the summary
+ * counters will be recalculated. Refer to xlog_check_unmount_rec for
+ * more details.
+ */
+ if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
+ XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
+ xfs_alert(mp, "%s: will fix summary counters at next mount",
+ __func__);
+ return;
+ }
+
xfs_log_unmount_verify_iclog(log);
- xfs_log_write_unmount_record(mp);
+ xlog_unmount_write(log);
}
/*
@@ -1515,20 +1442,17 @@ out:
return ERR_PTR(error);
} /* xlog_alloc_log */
-
/*
* Write out the commit record of a transaction associated with the given
- * ticket. Return the lsn of the commit record.
+ * ticket to close off a running log write. Return the lsn of the commit record.
*/
-STATIC int
+int
xlog_commit_record(
struct xlog *log,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
- xfs_lsn_t *commitlsnp)
+ xfs_lsn_t *lsn)
{
- struct xfs_mount *mp = log->l_mp;
- int error;
struct xfs_log_iovec reg = {
.i_addr = NULL,
.i_len = 0,
@@ -1538,12 +1462,15 @@ xlog_commit_record(
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
+ int error;
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return -EIO;
- ASSERT_ALWAYS(iclog);
- error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
- XLOG_COMMIT_TRANS);
+ error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS,
+ false);
if (error)
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
return error;
}
@@ -1761,7 +1688,15 @@ xlog_write_iclog(
iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
iclog->ic_bio.bi_end_io = xlog_bio_end_io;
iclog->ic_bio.bi_private = iclog;
- iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
+
+ /*
+ * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more
+ * IOs coming immediately after this one. This prevents the block layer
+ * writeback throttle from throttling log writes behind background
+ * metadata writeback and causing priority inversions.
+ */
+ iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
+ REQ_IDLE | REQ_FUA;
if (need_flush)
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
@@ -1981,7 +1916,7 @@ xlog_dealloc_log(
log->l_mp->m_log = NULL;
destroy_workqueue(log->l_ioend_workqueue);
kmem_free(log);
-} /* xlog_dealloc_log */
+}
/*
* Update counters atomically now that memcpy is done.
@@ -2118,23 +2053,21 @@ xlog_print_trans(
}
/*
- * Calculate the potential space needed by the log vector. Each region gets
- * its own xlog_op_header_t and may need to be double word aligned.
+ * Calculate the potential space needed by the log vector. We may need a start
+ * record, and each region gets its own struct xlog_op_header and may need to be
+ * double word aligned.
*/
static int
xlog_write_calc_vec_length(
struct xlog_ticket *ticket,
- struct xfs_log_vec *log_vector)
+ struct xfs_log_vec *log_vector,
+ bool need_start_rec)
{
struct xfs_log_vec *lv;
- int headers = 0;
+ int headers = need_start_rec ? 1 : 0;
int len = 0;
int i;
- /* acct for start rec of xact */
- if (ticket->t_flags & XLOG_TIC_INITED)
- headers++;
-
for (lv = log_vector; lv; lv = lv->lv_next) {
/* we don't write ordered log vectors */
if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
@@ -2156,27 +2089,16 @@ xlog_write_calc_vec_length(
return len;
}
-/*
- * If first write for transaction, insert start record We can't be trying to
- * commit if we are inited. We can't have any "partial_copy" if we are inited.
- */
-static int
+static void
xlog_write_start_rec(
struct xlog_op_header *ophdr,
struct xlog_ticket *ticket)
{
- if (!(ticket->t_flags & XLOG_TIC_INITED))
- return 0;
-
ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
ophdr->oh_clientid = ticket->t_clientid;
ophdr->oh_len = 0;
ophdr->oh_flags = XLOG_START_TRANS;
ophdr->oh_res2 = 0;
-
- ticket->t_flags &= ~XLOG_TIC_INITED;
-
- return sizeof(struct xlog_op_header);
}
static xlog_op_header_t *
@@ -2365,13 +2287,14 @@ xlog_write(
struct xlog_ticket *ticket,
xfs_lsn_t *start_lsn,
struct xlog_in_core **commit_iclog,
- uint flags)
+ uint flags,
+ bool need_start_rec)
{
struct xlog_in_core *iclog = NULL;
- struct xfs_log_iovec *vecp;
- struct xfs_log_vec *lv;
+ struct xfs_log_vec *lv = log_vector;
+ struct xfs_log_iovec *vecp = lv->lv_iovecp;
+ int index = 0;
int len;
- int index;
int partial_copy = 0;
int partial_copy_len = 0;
int contwr = 0;
@@ -2379,25 +2302,13 @@ xlog_write(
int data_cnt = 0;
int error = 0;
- *start_lsn = 0;
-
- len = xlog_write_calc_vec_length(ticket, log_vector);
-
/*
- * Region headers and bytes are already accounted for.
- * We only need to take into account start records and
- * split regions in this function.
+ * If this is a commit or unmount transaction, we don't need a start
+ * record to be written. We do, however, have to account for the
+ * commit or unmount header that gets written. Hence we always have
+ * to account for an extra xlog_op_header here.
*/
- if (ticket->t_flags & XLOG_TIC_INITED)
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
-
- /*
- * Commit record headers need to be accounted for. These
- * come in as separate writes so are easy to detect.
- */
- if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
-
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
if (ticket->t_curr_res < 0) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
@@ -2405,9 +2316,8 @@ xlog_write(
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
}
- index = 0;
- lv = log_vector;
- vecp = lv->lv_iovecp;
+ len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec);
+ *start_lsn = 0;
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
void *ptr;
int log_offset;
@@ -2431,7 +2341,6 @@ xlog_write(
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
struct xfs_log_iovec *reg;
struct xlog_op_header *ophdr;
- int start_rec_copy;
int copy_len;
int copy_off;
bool ordered = false;
@@ -2447,11 +2356,15 @@ xlog_write(
ASSERT(reg->i_len % sizeof(int32_t) == 0);
ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
- start_rec_copy = xlog_write_start_rec(ptr, ticket);
- if (start_rec_copy) {
- record_cnt++;
+ /*
+ * Before we start formatting log vectors, we need to
+ * write a start record. Only do this for the first
+ * iclog we write to.
+ */
+ if (need_start_rec) {
+ xlog_write_start_rec(ptr, ticket);
xlog_write_adv_cnt(&ptr, &len, &log_offset,
- start_rec_copy);
+ sizeof(struct xlog_op_header));
}
ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
@@ -2483,8 +2396,13 @@ xlog_write(
xlog_write_adv_cnt(&ptr, &len, &log_offset,
copy_len);
}
- copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+ copy_len += sizeof(struct xlog_op_header);
record_cnt++;
+ if (need_start_rec) {
+ copy_len += sizeof(struct xlog_op_header);
+ record_cnt++;
+ need_start_rec = false;
+ }
data_cnt += contwr ? copy_len : 0;
error = xlog_write_copy_finish(log, iclog, flags,
@@ -2541,14 +2459,6 @@ next_lv:
return error;
}
-
-/*****************************************************************************
- *
- * State Machine functions
- *
- *****************************************************************************
- */
-
static void
xlog_state_activate_iclog(
struct xlog_in_core *iclog,
@@ -2909,7 +2819,7 @@ xlog_state_done_syncing(
*/
wake_up_all(&iclog->ic_write_wait);
spin_unlock(&log->l_icloglock);
- xlog_state_do_callback(log); /* also cleans log */
+ xlog_state_do_callback(log);
}
/*
@@ -3029,21 +2939,21 @@ restart:
*logoffsetp = log_offset;
return 0;
-} /* xlog_state_get_iclog_space */
-
-/* The first cnt-1 times through here we don't need to
- * move the grant write head because the permanent
- * reservation has reserved cnt times the unit amount.
- * Release part of current permanent unit reservation and
- * reset current reservation to be one units worth. Also
- * move grant reservation head forward.
+}
+
+/*
+ * The first cnt-1 times a ticket goes through here we don't need to move the
+ * grant write head because the permanent reservation has reserved cnt times the
+ * unit amount. Release part of current permanent unit reservation and reset
+ * current reservation to be one units worth. Also move grant reservation head
+ * forward.
*/
-STATIC void
-xlog_regrant_reserve_log_space(
+void
+xfs_log_ticket_regrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
- trace_xfs_log_regrant_reserve_enter(log, ticket);
+ trace_xfs_log_ticket_regrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
@@ -3055,21 +2965,20 @@ xlog_regrant_reserve_log_space(
ticket->t_curr_res = ticket->t_unit_res;
xlog_tic_reset_res(ticket);
- trace_xfs_log_regrant_reserve_sub(log, ticket);
+ trace_xfs_log_ticket_regrant_sub(log, ticket);
/* just return if we still have some of the pre-reserved space */
- if (ticket->t_cnt > 0)
- return;
+ if (!ticket->t_cnt) {
+ xlog_grant_add_space(log, &log->l_reserve_head.grant,
+ ticket->t_unit_res);
+ trace_xfs_log_ticket_regrant_exit(log, ticket);
- xlog_grant_add_space(log, &log->l_reserve_head.grant,
- ticket->t_unit_res);
-
- trace_xfs_log_regrant_reserve_exit(log, ticket);
-
- ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
-} /* xlog_regrant_reserve_log_space */
+ ticket->t_curr_res = ticket->t_unit_res;
+ xlog_tic_reset_res(ticket);
+ }
+ xfs_log_ticket_put(ticket);
+}
/*
* Give back the space left from a reservation.
@@ -3085,18 +2994,19 @@ xlog_regrant_reserve_log_space(
* space, the count will stay at zero and the only space remaining will be
* in the current reservation field.
*/
-STATIC void
-xlog_ungrant_log_space(
+void
+xfs_log_ticket_ungrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
- int bytes;
+ int bytes;
+
+ trace_xfs_log_ticket_ungrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
- trace_xfs_log_ungrant_enter(log, ticket);
- trace_xfs_log_ungrant_sub(log, ticket);
+ trace_xfs_log_ticket_ungrant_sub(log, ticket);
/*
* If this is a permanent reservation ticket, we may be able to free
@@ -3111,18 +3021,15 @@ xlog_ungrant_log_space(
xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
- trace_xfs_log_ungrant_exit(log, ticket);
+ trace_xfs_log_ticket_ungrant_exit(log, ticket);
xfs_log_space_wake(log->l_mp);
+ xfs_log_ticket_put(ticket);
}
/*
- * Mark the current iclog in the ring as WANT_SYNC and move the current iclog
- * pointer to the next iclog in the ring.
- *
- * When called from xlog_state_get_iclog_space(), the exact size of the iclog
- * has not yet been determined, all we know is that we have run out of space in
- * the current iclog.
+ * This routine will mark the current iclog in the ring as WANT_SYNC and move
+ * the current iclog pointer to the next iclog in the ring.
*/
STATIC void
xlog_state_switch_iclogs(
@@ -3167,7 +3074,7 @@ xlog_state_switch_iclogs(
}
ASSERT(iclog == log->l_iclog);
log->l_iclog = iclog->ic_next;
-} /* xlog_state_switch_iclogs */
+}
/*
* Write out all data in the in-core log as of this exact moment in time.
@@ -3374,13 +3281,6 @@ xfs_log_force_lsn(
return ret;
}
-/*****************************************************************************
- *
- * TICKET functions
- *
- *****************************************************************************
- */
-
/*
* Free a used ticket when its refcount falls to zero.
*/
@@ -3529,7 +3429,6 @@ xlog_ticket_alloc(
tic->t_ocnt = cnt;
tic->t_tid = prandom_u32();
tic->t_clientid = client;
- tic->t_flags = XLOG_TIC_INITED;
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
@@ -3538,13 +3437,6 @@ xlog_ticket_alloc(
return tic;
}
-
-/******************************************************************************
- *
- * Log debug routines
- *
- ******************************************************************************
- */
#if defined(DEBUG)
/*
* Make sure that the destination ptr is within the valid data region of
@@ -3630,7 +3522,7 @@ xlog_verify_tail_lsn(
if (blocks < BTOBB(iclog->ic_offset) + 1)
xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
}
-} /* xlog_verify_tail_lsn */
+}
/*
* Perform a number of checks on the iclog before writing to disk.
@@ -3733,7 +3625,7 @@ xlog_verify_iclog(
}
ptr += sizeof(xlog_op_header_t) + op_len;
}
-} /* xlog_verify_iclog */
+}
#endif
/*
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index cc77cc36560a..1412d6993f1e 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -105,10 +105,6 @@ struct xfs_log_item;
struct xfs_item_ops;
struct xfs_trans;
-xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- bool regrant);
int xfs_log_force(struct xfs_mount *mp, uint flags);
int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags,
int *log_forced);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 64cc0bf2ab3b..b43f0e8f43f2 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -669,6 +669,11 @@ xlog_cil_push_work(
ASSERT(push_seq <= ctx->sequence);
/*
+ * Wake up any background push waiters now this context is being pushed.
+ */
+ wake_up_all(&ctx->push_wait);
+
+ /*
* Check if we've anything to push. If there is nothing, then we don't
* move on to a new sequence number and so we have to be able to push
* this sequence again later.
@@ -744,6 +749,7 @@ xlog_cil_push_work(
*/
INIT_LIST_HEAD(&new_ctx->committing);
INIT_LIST_HEAD(&new_ctx->busy_extents);
+ init_waitqueue_head(&new_ctx->push_wait);
new_ctx->sequence = ctx->sequence + 1;
new_ctx->cil = cil;
cil->xc_ctx = new_ctx;
@@ -801,7 +807,7 @@ xlog_cil_push_work(
lvhdr.lv_iovecp = &lhdr;
lvhdr.lv_next = ctx->lv_chain;
- error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+ error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true);
if (error)
goto out_abort_free_ticket;
@@ -839,10 +845,11 @@ restart:
}
spin_unlock(&cil->xc_push_lock);
- /* xfs_log_done always frees the ticket on error. */
- commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
- if (commit_lsn == -1)
- goto out_abort;
+ error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn);
+ if (error)
+ goto out_abort_free_ticket;
+
+ xfs_log_ticket_ungrant(log, tic);
spin_lock(&commit_iclog->ic_callback_lock);
if (commit_iclog->ic_state == XLOG_STATE_IOERROR) {
@@ -875,7 +882,7 @@ out_skip:
return;
out_abort_free_ticket:
- xfs_log_ticket_put(tic);
+ xfs_log_ticket_ungrant(log, tic);
out_abort:
ASSERT(XLOG_FORCED_SHUTDOWN(log));
xlog_cil_committed(ctx);
@@ -890,7 +897,7 @@ out_abort:
*/
static void
xlog_cil_push_background(
- struct xlog *log)
+ struct xlog *log) __releases(cil->xc_ctx_lock)
{
struct xfs_cil *cil = log->l_cilp;
@@ -904,14 +911,36 @@ xlog_cil_push_background(
* don't do a background push if we haven't used up all the
* space available yet.
*/
- if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
+ up_read(&cil->xc_ctx_lock);
return;
+ }
spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
}
+
+ /*
+ * Drop the context lock now, we can't hold that if we need to sleep
+ * because we are over the blocking threshold. The push_lock is still
+ * held, so blocking threshold sleep/wakeup is still correctly
+ * serialised here.
+ */
+ up_read(&cil->xc_ctx_lock);
+
+ /*
+ * If we are well over the space limit, throttle the work that is being
+ * done until the push work on this context has begun.
+ */
+ if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
+ trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
+ ASSERT(cil->xc_ctx->space_used < log->l_logsize);
+ xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock);
+ return;
+ }
+
spin_unlock(&cil->xc_push_lock);
}
@@ -1007,7 +1036,10 @@ xfs_log_commit_cil(
if (commit_lsn)
*commit_lsn = xc_commit_lsn;
- xfs_log_done(mp, tp->t_ticket, NULL, regrant);
+ if (regrant && !XLOG_FORCED_SHUTDOWN(log))
+ xfs_log_ticket_regrant(log, tp->t_ticket);
+ else
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
tp->t_ticket = NULL;
xfs_trans_unreserve_and_mod_sb(tp);
@@ -1028,9 +1060,9 @@ xfs_log_commit_cil(
if (lip->li_ops->iop_committing)
lip->li_ops->iop_committing(lip, xc_commit_lsn);
}
- xlog_cil_push_background(log);
- up_read(&cil->xc_ctx_lock);
+ /* xlog_cil_push_background() releases cil->xc_ctx_lock */
+ xlog_cil_push_background(log);
}
/*
@@ -1189,6 +1221,7 @@ xlog_cil_init(
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents);
+ init_waitqueue_head(&ctx->push_wait);
ctx->sequence = 1;
ctx->cil = cil;
cil->xc_ctx = ctx;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 2b0aec37e73e..ec22c7a3867f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -51,13 +51,11 @@ enum xlog_iclog_state {
};
/*
- * Flags to log ticket
+ * Log ticket flags
*/
-#define XLOG_TIC_INITED 0x1 /* has been initialized */
-#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
+#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */
#define XLOG_TIC_FLAGS \
- { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
/*
@@ -242,6 +240,7 @@ struct xfs_cil_ctx {
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
+ wait_queue_head_t push_wait; /* background push throttle */
struct work_struct discard_endio_work;
};
@@ -318,13 +317,53 @@ struct xfs_cil {
* tries to keep 25% of the log free, so we need to keep below that limit or we
* risk running out of free log space to start any new transactions.
*
- * In order to keep background CIL push efficient, we will set a lower
- * threshold at which background pushing is attempted without blocking current
- * transaction commits. A separate, higher bound defines when CIL pushes are
- * enforced to ensure we stay within our maximum checkpoint size bounds.
- * threshold, yet give us plenty of space for aggregation on large logs.
+ * In order to keep background CIL push efficient, we only need to ensure the
+ * CIL is large enough to maintain sufficient in-memory relogging to avoid
+ * repeated physical writes of frequently modified metadata. If we allow the CIL
+ * to grow to a substantial fraction of the log, then we may be pinning hundreds
+ * of megabytes of metadata in memory until the CIL flushes. This can cause
+ * issues when we are running low on memory - pinned memory cannot be reclaimed,
+ * and the CIL consumes a lot of memory. Hence we need to set an upper physical
+ * size limit for the CIL that limits the maximum amount of memory pinned by the
+ * CIL but does not limit performance by reducing relogging efficiency
+ * significantly.
+ *
+ * As such, the CIL push threshold ends up being the smaller of two thresholds:
+ * - a threshold large enough that it allows CIL to be pushed and progress to be
+ * made without excessive blocking of incoming transaction commits. This is
+ * defined to be 12.5% of the log space - half the 25% push threshold of the
+ * AIL.
+ * - small enough that it doesn't pin excessive amounts of memory but maintains
+ * close to peak relogging efficiency. This is defined to be 16x the iclog
+ * buffer window (32MB) as measurements have shown this to be roughly the
+ * point of diminishing performance increases under highly concurrent
+ * modification workloads.
+ *
+ * To prevent the CIL from overflowing upper commit size bounds, we introduce a
+ * new threshold at which we block committing transactions until the background
+ * CIL commit commences and switches to a new context. While this is not a hard
+ * limit, it forces the process committing a transaction to the CIL to block and
+ * yeild the CPU, giving the CIL push work a chance to be scheduled and start
+ * work. This prevents a process running lots of transactions from overfilling
+ * the CIL because it is not yielding the CPU. We set the blocking limit at
+ * twice the background push space threshold so we keep in line with the AIL
+ * push thresholds.
+ *
+ * Note: this is not a -hard- limit as blocking is applied after the transaction
+ * is inserted into the CIL and the push has been triggered. It is largely a
+ * throttling mechanism that allows the CIL push to be scheduled and run. A hard
+ * limit will be difficult to implement without introducing global serialisation
+ * in the CIL commit fast path, and it's not at all clear that we actually need
+ * such hard limits given the ~7 years we've run without a hard limit before
+ * finding the first situation where a checkpoint size overflow actually
+ * occurred. Hence the simple throttle, and an ASSERT check to tell us that
+ * we've overrun the max size.
*/
-#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
+#define XLOG_CIL_SPACE_LIMIT(log) \
+ min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
+
+#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \
+ (XLOG_CIL_SPACE_LIMIT(log) * 2)
/*
* ticket grant locks, queues and accounting have their own cachlines
@@ -439,14 +478,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
-int
-xlog_write(
- struct xlog *log,
- struct xfs_log_vec *log_vector,
- struct xlog_ticket *tic,
- xfs_lsn_t *start_lsn,
- struct xlog_in_core **commit_iclog,
- uint flags);
+int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
+ struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+ struct xlog_in_core **commit_iclog, uint flags,
+ bool need_start_rec);
+int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
+ struct xlog_in_core **iclog, xfs_lsn_t *lsn);
+void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
+void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 88ab09ed29e7..50c43422fa17 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -167,6 +167,7 @@ typedef struct xfs_mount {
struct xfs_kobj m_error_meta_kobj;
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
+ struct ratelimit_state m_flush_inodes_ratelimit;
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index cabdb755adae..c225691fad15 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -121,12 +121,11 @@ xfs_qm_dqpurge(
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_quotainfo *qi = mp->m_quotainfo;
+ int error = -EAGAIN;
xfs_dqlock(dqp);
- if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
- xfs_dqunlock(dqp);
- return -EAGAIN;
- }
+ if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0)
+ goto out_unlock;
dqp->dq_flags |= XFS_DQ_FREEING;
@@ -139,7 +138,6 @@ xfs_qm_dqpurge(
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
- int error;
/*
* We don't care about getting disk errors here. We need
@@ -149,6 +147,8 @@ xfs_qm_dqpurge(
if (!error) {
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
+ } else if (error == -EAGAIN) {
+ goto out_unlock;
}
xfs_dqflock(dqp);
}
@@ -174,6 +174,10 @@ xfs_qm_dqpurge(
xfs_qm_dqdestroy(dqp);
return 0;
+
+out_unlock:
+ xfs_dqunlock(dqp);
+ return error;
}
/*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2094386af8ac..abf06bf9c3f3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -528,6 +528,9 @@ xfs_flush_inodes(
{
struct super_block *sb = mp->m_super;
+ if (!__ratelimit(&mp->m_flush_inodes_ratelimit))
+ return;
+
if (down_read_trylock(&sb->s_umount)) {
sync_inodes_sb(sb);
up_read(&sb->s_umount);
@@ -1366,6 +1369,17 @@ xfs_fc_fill_super(
if (error)
goto out_free_names;
+ /*
+ * Cap the number of invocations of xfs_flush_inodes to 16 for every
+ * quarter of a second. The magic numbers here were determined by
+ * observation neither to cause stalls in writeback when there are a
+ * lot of IO threads and the fs is near ENOSPC, nor cause any fstest
+ * regressions. YMMV.
+ */
+ ratelimit_state_init(&mp->m_flush_inodes_ratelimit, HZ / 4, 16);
+ ratelimit_set_flags(&mp->m_flush_inodes_ratelimit,
+ RATELIMIT_MSG_ON_RELEASE);
+
error = xfs_init_mount_workqueues(mp);
if (error)
goto out_close_devices;
@@ -1861,7 +1875,8 @@ xfs_init_zones(void)
xfs_ili_zone = kmem_cache_create("xfs_ili",
sizeof(struct xfs_inode_log_item), 0,
- SLAB_MEM_SPREAD, NULL);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
if (!xfs_ili_zone)
goto out_destroy_inode_zone;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index fa0fa3c70f1a..13fb4b919648 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -176,7 +176,6 @@ xfs_symlink(
return -ENAMETOOLONG;
ASSERT(pathlen > 0);
- udqp = gdqp = NULL;
prid = xfs_get_initial_prid(dp);
/*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index efc7751550d9..a4323a63438d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1001,8 +1001,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
DEFINE_EVENT(xfs_loggrant_class, name, \
TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \
TP_ARGS(log, tic))
-DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
-DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
@@ -1011,12 +1009,13 @@ DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
DECLARE_EVENT_CLASS(xfs_log_item_class,
TP_PROTO(struct xfs_log_item *lip),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 1adc6bc53a56..28b983ff8b11 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -9,6 +9,7 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
+#include "xfs_log_priv.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_extent_busy.h"
@@ -150,8 +151,9 @@ xfs_trans_reserve(
uint blocks,
uint rtextents)
{
- int error = 0;
- bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+ struct xfs_mount *mp = tp->t_mountp;
+ int error = 0;
+ bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
/* Mark this thread as being in a transaction */
current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
@@ -162,7 +164,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (blocks > 0) {
- error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
+ error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
if (error != 0) {
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return -ENOSPC;
@@ -191,9 +193,9 @@ xfs_trans_reserve(
if (tp->t_ticket != NULL) {
ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
- error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+ error = xfs_log_regrant(mp, tp->t_ticket);
} else {
- error = xfs_log_reserve(tp->t_mountp,
+ error = xfs_log_reserve(mp,
resp->tr_logres,
resp->tr_logcount,
&tp->t_ticket, XFS_TRANSACTION,
@@ -213,7 +215,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (rtextents > 0) {
- error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
+ error = xfs_mod_frextents(mp, -((int64_t)rtextents));
if (error) {
error = -ENOSPC;
goto undo_log;
@@ -229,7 +231,7 @@ xfs_trans_reserve(
*/
undo_log:
if (resp->tr_logres > 0) {
- xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
tp->t_log_res = 0;
tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
@@ -237,7 +239,7 @@ undo_log:
undo_blocks:
if (blocks > 0) {
- xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
+ xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
tp->t_blk_res = 0;
}
@@ -1004,9 +1006,10 @@ out_unreserve:
*/
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
- if (commit_lsn == -1 && !error)
- error = -EIO;
+ if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log))
+ xfs_log_ticket_regrant(mp->m_log, tp->t_ticket);
+ else
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
@@ -1065,7 +1068,7 @@ xfs_trans_cancel(
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- xfs_log_done(mp, tp->t_ticket, NULL, false);
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2ef0dfbfb303..564253550b75 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -109,17 +109,25 @@ xfs_ail_next(
* We need the AIL lock in order to get a coherent read of the lsn of the last
* item in the AIL.
*/
+static xfs_lsn_t
+__xfs_ail_min_lsn(
+ struct xfs_ail *ailp)
+{
+ struct xfs_log_item *lip = xfs_ail_min(ailp);
+
+ if (lip)
+ return lip->li_lsn;
+ return 0;
+}
+
xfs_lsn_t
xfs_ail_min_lsn(
struct xfs_ail *ailp)
{
- xfs_lsn_t lsn = 0;
- struct xfs_log_item *lip;
+ xfs_lsn_t lsn;
spin_lock(&ailp->ail_lock);
- lip = xfs_ail_min(ailp);
- if (lip)
- lsn = lip->li_lsn;
+ lsn = __xfs_ail_min_lsn(ailp);
spin_unlock(&ailp->ail_lock);
return lsn;
@@ -681,6 +689,28 @@ xfs_ail_push_all_sync(
finish_wait(&ailp->ail_empty, &wait);
}
+void
+xfs_ail_update_finish(
+ struct xfs_ail *ailp,
+ xfs_lsn_t old_lsn) __releases(ailp->ail_lock)
+{
+ struct xfs_mount *mp = ailp->ail_mount;
+
+ /* if the tail lsn hasn't changed, don't do updates or wakeups. */
+ if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
+ spin_unlock(&ailp->ail_lock);
+ return;
+ }
+
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xlog_assign_tail_lsn_locked(mp);
+
+ if (list_empty(&ailp->ail_head))
+ wake_up_all(&ailp->ail_empty);
+ spin_unlock(&ailp->ail_lock);
+ xfs_log_space_wake(mp);
+}
+
/*
* xfs_trans_ail_update - bulk AIL insertion operation.
*
@@ -712,7 +742,7 @@ xfs_trans_ail_update_bulk(
xfs_lsn_t lsn) __releases(ailp->ail_lock)
{
struct xfs_log_item *mlip;
- int mlip_changed = 0;
+ xfs_lsn_t tail_lsn = 0;
int i;
LIST_HEAD(tmp);
@@ -727,9 +757,10 @@ xfs_trans_ail_update_bulk(
continue;
trace_xfs_ail_move(lip, lip->li_lsn, lsn);
+ if (mlip == lip && !tail_lsn)
+ tail_lsn = lip->li_lsn;
+
xfs_ail_delete(ailp, lip);
- if (mlip == lip)
- mlip_changed = 1;
} else {
trace_xfs_ail_insert(lip, 0, lsn);
}
@@ -740,23 +771,23 @@ xfs_trans_ail_update_bulk(
if (!list_empty(&tmp))
xfs_ail_splice(ailp, cur, &tmp, lsn);
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
- xlog_assign_tail_lsn_locked(ailp->ail_mount);
- spin_unlock(&ailp->ail_lock);
-
- xfs_log_space_wake(ailp->ail_mount);
- } else {
- spin_unlock(&ailp->ail_lock);
- }
+ xfs_ail_update_finish(ailp, tail_lsn);
}
-bool
+/*
+ * Delete one log item from the AIL.
+ *
+ * If this item was at the tail of the AIL, return the LSN of the log item so
+ * that we can use it to check if the LSN of the tail of the log has moved
+ * when finishing up the AIL delete process in xfs_ail_update_finish().
+ */
+xfs_lsn_t
xfs_ail_delete_one(
struct xfs_ail *ailp,
struct xfs_log_item *lip)
{
struct xfs_log_item *mlip = xfs_ail_min(ailp);
+ xfs_lsn_t lsn = lip->li_lsn;
trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
@@ -764,7 +795,9 @@ xfs_ail_delete_one(
clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
lip->li_lsn = 0;
- return mlip == lip;
+ if (mlip == lip)
+ return lsn;
+ return 0;
}
/**
@@ -792,10 +825,10 @@ void
xfs_trans_ail_delete(
struct xfs_ail *ailp,
struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->ail_lock)
+ int shutdown_type)
{
struct xfs_mount *mp = ailp->ail_mount;
- bool mlip_changed;
+ xfs_lsn_t tail_lsn;
if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
@@ -808,17 +841,8 @@ xfs_trans_ail_delete(
return;
}
- mlip_changed = xfs_ail_delete_one(ailp, lip);
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(mp))
- xlog_assign_tail_lsn_locked(mp);
- if (list_empty(&ailp->ail_head))
- wake_up_all(&ailp->ail_empty);
- }
-
- spin_unlock(&ailp->ail_lock);
- if (mlip_changed)
- xfs_log_space_wake(ailp->ail_mount);
+ tail_lsn = xfs_ail_delete_one(ailp, lip);
+ xfs_ail_update_finish(ailp, tail_lsn);
}
int
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 2e073c1c4614..35655eac01a6 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -91,9 +91,11 @@ xfs_trans_ail_update(
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
-bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn)
+ __releases(ailp->ail_lock);
void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->ail_lock);
+ int shutdown_type);
static inline void
xfs_trans_ail_remove(