Diffstat (limited to 'fs/ceph')
-rw-r--r--   fs/ceph/Kconfig      |    2
-rw-r--r--   fs/ceph/addr.c       | 1390
-rw-r--r--   fs/ceph/caps.c       |  133
-rw-r--r--   fs/ceph/crypto.h     |    2
-rw-r--r--   fs/ceph/debugfs.c    |    4
-rw-r--r--   fs/ceph/dir.c        |  107
-rw-r--r--   fs/ceph/export.c     |   14
-rw-r--r--   fs/ceph/file.c       |  143
-rw-r--r--   fs/ceph/inode.c      |   84
-rw-r--r--   fs/ceph/mds_client.c |  354
-rw-r--r--   fs/ceph/mds_client.h |   48
-rw-r--r--   fs/ceph/quota.c      |    2
-rw-r--r--   fs/ceph/super.c      |   29
-rw-r--r--   fs/ceph/super.h      |   24
-rw-r--r--   fs/ceph/xattr.c      |   14
15 files changed, 1593 insertions, 757 deletions
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 7249d70e1a43..3e7def3d31c1 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -3,7 +3,7 @@ config CEPH_FS tristate "Ceph distributed file system" depends on INET select CEPH_LIB - select LIBCRC32C + select CRC32 select CRYPTO_AES select CRYPTO select NETFS_SUPPORT diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ee9caf7916fb..29be367905a1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -13,6 +13,7 @@ #include <linux/iversion.h> #include <linux/ktime.h> #include <linux/netfs.h> +#include <trace/events/netfs.h> #include "super.h" #include "mds_client.h" @@ -81,6 +82,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) { struct inode *inode = mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_info *ci; struct ceph_snap_context *snapc; @@ -91,11 +93,12 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) return false; } + atomic64_inc(&mdsc->dirty_folios); + ci = ceph_inode(inode); /* dirty the head */ spin_lock(&ci->i_ceph_lock); - BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference if (__ceph_have_pending_cap_snap(ci)) { struct ceph_cap_snap *capsnap = list_last_entry(&ci->i_cap_snaps, @@ -193,7 +196,7 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) * block, but do not exceed the file size, unless the original * request already exceeds it. */ - new_end = min(round_up(end, lo->stripe_unit), rreq->i_size); + new_end = umin(round_up(end, lo->stripe_unit), rreq->i_size); if (new_end > end && new_end <= rreq->start + max_len) rreq->len = new_end - rreq->start; @@ -205,21 +208,6 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) } } -static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) -{ - struct inode *inode = subreq->rreq->inode; - struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); - struct ceph_inode_info *ci = ceph_inode(inode); - u64 objno, objoff; - u32 xlen; - - /* Truncate the extent at the end of the current block */ - ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, - &objno, &objoff, &xlen); - subreq->len = min(xlen, fsc->mount_options->rsize); - return true; -} - static void finish_netfs_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; @@ -238,15 +226,19 @@ static void finish_netfs_read(struct ceph_osd_request *req) subreq->len, i_size_read(req->r_inode)); /* no object means success but no data */ - if (err == -ENOENT) + if (err == -ENOENT) { + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); err = 0; - else if (err == -EBLOCKLISTED) + } else if (err == -EBLOCKLISTED) { fsc->blocklisted = true; + } if (err >= 0) { if (sparse && err > 0) err = ceph_sparse_ext_map_end(op); - if (err < subreq->len) + if (err < subreq->len && + subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (IS_ENCRYPTED(inode) && err > 0) { err = ceph_fscrypt_decrypt_extents(inode, @@ -256,6 +248,8 @@ static void finish_netfs_read(struct ceph_osd_request *req) if (err > subreq->len) err = subreq->len; } + if (err > 0) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); } if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { @@ -263,7 +257,13 @@ static void finish_netfs_read(struct ceph_osd_request *req) calc_pages_for(osd_data->alignment, osd_data->length), false); } - 
netfs_subreq_terminated(subreq, err, false); + if (err > 0) { + subreq->transferred = err; + err = 0; + } + subreq->error = err; + trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress); + netfs_read_subreq_terminated(subreq); iput(req->r_inode); ceph_dec_osd_stopping_blocker(fsc->mdsc); } @@ -277,12 +277,12 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_info *ci = ceph_inode(inode); - struct iov_iter iter; ssize_t err = 0; size_t len; int mode; - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); if (subreq->start >= inode->i_size) @@ -299,6 +299,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA); req->r_num_caps = 2; + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) goto out; @@ -312,17 +313,38 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) } len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); - iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); - err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); - if (err == 0) + err = copy_to_iter(iinfo->inline_data + subreq->start, len, &subreq->io_iter); + if (err == 0) { err = -EFAULT; + } else { + subreq->transferred += err; + err = 0; + } ceph_mdsc_put_request(req); out: - netfs_subreq_terminated(subreq, err, false); + subreq->error = err; + trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress); + netfs_read_subreq_terminated(subreq); return true; } +static int ceph_netfs_prepare_read(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + struct inode *inode = rreq->inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + u64 objno, objoff; + u32 xlen; + + /* Truncate the extent at the end of the current block */ + ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, + &objno, &objoff, &xlen); + rreq->io_streams[0].sreq_max_len = umin(xlen, fsc->mount_options->rsize); + return 0; +} + static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; @@ -332,9 +354,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct ceph_client *cl = fsc->client; struct ceph_osd_request *req = NULL; struct ceph_vino vino = ceph_vino(inode); - struct iov_iter iter; - int err = 0; - u64 len = subreq->len; + int err; + u64 len; bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); u64 off = subreq->start; int extent_cnt; @@ -347,6 +368,12 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; + // TODO: This rounding here is slightly dodgy. It *should* work, for + // now, as the cache only deals in blocks that are a multiple of + // PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE. What needs to + // happen is for the fscrypt driving to be moved into netfslib and the + // data in the cache also to be stored encrypted. 
+ len = subreq->len; ceph_fscrypt_adjust_off_and_len(inode, &off, &len); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, @@ -369,8 +396,6 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n", ceph_vinop(inode), subreq->start, subreq->len, len); - iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); - /* * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for * encrypted inodes. We'd need infrastructure that handles an iov_iter @@ -382,7 +407,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct page **pages; size_t page_off; - err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); + err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off); if (err < 0) { doutc(cl, "%llx.%llx failed to allocate pages, %d\n", ceph_vinop(inode), err); @@ -397,7 +422,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); } else { - osd_req_op_extent_osd_iter(req, 0, &iter); + osd_req_op_extent_osd_iter(req, 0, &subreq->io_iter); } if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { err = -EIO; @@ -408,22 +433,29 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) req->r_inode = inode; ihold(inode); + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); ceph_osdc_start_request(req->r_osdc, req); out: ceph_osdc_put_request(req); - if (err) - netfs_subreq_terminated(subreq, err, false); + if (err) { + subreq->error = err; + netfs_read_subreq_terminated(subreq); + } doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err); } static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) { struct inode *inode = rreq->inode; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = ceph_inode_to_client(inode); int got = 0, want = CEPH_CAP_FILE_CACHE; struct ceph_netfs_request_data *priv; int ret = 0; + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. 
*/ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); + if (rreq->origin != NETFS_READAHEAD) return 0; @@ -467,10 +499,14 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) priv->caps = got; rreq->netfs_priv = priv; + rreq->io_streams[0].sreq_max_len = fsc->mount_options->rsize; out: - if (ret < 0) + if (ret < 0) { + if (got) + ceph_put_cap_refs(ceph_inode(inode), got); kfree(priv); + } return ret; } @@ -491,16 +527,16 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq) const struct netfs_request_ops ceph_netfs_ops = { .init_request = ceph_init_request, .free_request = ceph_netfs_free_request, + .prepare_read = ceph_netfs_prepare_read, .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, - .clamp_length = ceph_netfs_clamp_length, .check_write_begin = ceph_netfs_check_write_begin, }; #ifdef CONFIG_CEPH_FSCACHE static void ceph_set_page_fscache(struct page *page) { - set_page_fscache(page); + folio_start_private_2(page_folio(page)); /* [DEPRECATED] */ } static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) @@ -517,7 +553,7 @@ static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, b struct fscache_cookie *cookie = ceph_fscache_cookie(ci); fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode), - ceph_fscache_write_terminated, inode, caching); + ceph_fscache_write_terminated, inode, true, caching); } #else static inline void ceph_set_page_fscache(struct page *page) @@ -535,7 +571,36 @@ struct ceph_writeback_ctl u64 truncate_size; u32 truncate_seq; bool size_stable; + bool head_snapc; + struct ceph_snap_context *snapc; + struct ceph_snap_context *last_snapc; + + bool done; + bool should_loop; + bool range_whole; + pgoff_t start_index; + pgoff_t index; + pgoff_t end; + xa_mark_t tag; + + pgoff_t strip_unit_end; + unsigned int wsize; + unsigned int nr_folios; + unsigned int max_pages; + unsigned int locked_pages; + + int op_idx; + int num_ops; + u64 offset; + u64 len; + + struct folio_batch fbatch; + unsigned int processed_in_fbatch; + + bool from_pool; + struct page **pages; + struct page **data_pages; }; /* @@ -633,22 +698,23 @@ static u64 get_writepages_data_length(struct inode *inode, } /* - * Write a single page, but leave the page locked. + * Write a folio, but leave it locked. * * If we get a write error, mark the mapping for error, but still adjust the - * dirty page accounting (i.e., page is no longer dirty). + * dirty page accounting (i.e., folio is no longer dirty). 
*/ -static int writepage_nounlock(struct page *page, struct writeback_control *wbc) +static int write_folio_nounlock(struct folio *folio, + struct writeback_control *wbc) { - struct folio *folio = page_folio(page); - struct inode *inode = page->mapping->host; + struct page *page = &folio->page; + struct inode *inode = folio->mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_snap_context *snapc, *oldest; - loff_t page_off = page_offset(page); + loff_t page_off = folio_pos(folio); int err; - loff_t len = thp_size(page); + loff_t len = folio_size(folio); loff_t wlen; struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; @@ -656,27 +722,27 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) bool caching = ceph_is_cache_enabled(inode); struct page *bounce_page = NULL; - doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page, - page->index); + doutc(cl, "%llx.%llx folio %p idx %lu\n", ceph_vinop(inode), folio, + folio->index); if (ceph_inode_is_shutdown(inode)) return -EIO; /* verify this is a writeable snap context */ - snapc = page_snap_context(page); + snapc = page_snap_context(&folio->page); if (!snapc) { - doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode), - page); + doutc(cl, "%llx.%llx folio %p not dirty?\n", ceph_vinop(inode), + folio); return 0; } oldest = get_oldest_context(inode, &ceph_wbc, snapc); if (snapc->seq > oldest->seq) { - doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n", - ceph_vinop(inode), page, snapc); + doutc(cl, "%llx.%llx folio %p snapc %p not writeable - noop\n", + ceph_vinop(inode), folio, snapc); /* we should only noop if called by kswapd */ WARN_ON(!(current->flags & PF_MEMALLOC)); ceph_put_snap_context(oldest); - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); return 0; } ceph_put_snap_context(oldest); @@ -693,8 +759,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) len = ceph_wbc.i_size - page_off; wlen = IS_ENCRYPTED(inode) ? 
round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len; - doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n", - ceph_vinop(inode), page, page->index, page_off, wlen, snapc, + doutc(cl, "%llx.%llx folio %p index %lu on %llu~%llu snapc %p seq %lld\n", + ceph_vinop(inode), folio, folio->index, page_off, wlen, snapc, snapc->seq); if (atomic_long_inc_return(&fsc->writeback_count) > @@ -707,32 +773,32 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); if (IS_ERR(req)) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); return PTR_ERR(req); } if (wlen < len) len = wlen; - set_page_writeback(page); + folio_start_writeback(folio); if (caching) - ceph_set_page_fscache(page); + ceph_set_page_fscache(&folio->page); ceph_fscache_write_to_cache(inode, page_off, len, caching); if (IS_ENCRYPTED(inode)) { - bounce_page = fscrypt_encrypt_pagecache_blocks(page, + bounce_page = fscrypt_encrypt_pagecache_blocks(folio, CEPH_FSCRYPT_BLOCK_SIZE, 0, GFP_NOFS); if (IS_ERR(bounce_page)) { - redirty_page_for_writepage(wbc, page); - end_page_writeback(page); + folio_redirty_for_writepage(wbc, folio); + folio_end_writeback(folio); ceph_osdc_put_request(req); return PTR_ERR(bounce_page); } } /* it may be a short write due to an object boundary */ - WARN_ON_ONCE(len > thp_size(page)); + WARN_ON_ONCE(len > folio_size(folio)); osd_req_op_extent_osd_data_pages(req, 0, bounce_page ? &bounce_page : &page, wlen, 0, false, false); @@ -758,25 +824,25 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (err == -ERESTARTSYS) { /* killed by SIGKILL */ doutc(cl, "%llx.%llx interrupted page %p\n", - ceph_vinop(inode), page); - redirty_page_for_writepage(wbc, page); - end_page_writeback(page); + ceph_vinop(inode), folio); + folio_redirty_for_writepage(wbc, folio); + folio_end_writeback(folio); return err; } if (err == -EBLOCKLISTED) fsc->blocklisted = true; - doutc(cl, "%llx.%llx setting page/mapping error %d %p\n", - ceph_vinop(inode), err, page); + doutc(cl, "%llx.%llx setting mapping error %d %p\n", + ceph_vinop(inode), err, folio); mapping_set_error(&inode->i_data, err); wbc->pages_skipped++; } else { doutc(cl, "%llx.%llx cleaned page %p\n", - ceph_vinop(inode), page); + ceph_vinop(inode), folio); err = 0; /* vfs expects us to return 0 */ } - oldest = detach_page_private(page); + oldest = folio_detach_private(folio); WARN_ON_ONCE(oldest != snapc); - end_page_writeback(page); + folio_end_writeback(folio); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); /* page's reference */ @@ -787,32 +853,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) return err; } -static int ceph_writepage(struct page *page, struct writeback_control *wbc) -{ - int err; - struct inode *inode = page->mapping->host; - BUG_ON(!inode); - ihold(inode); - - if (wbc->sync_mode == WB_SYNC_NONE && - ceph_inode_to_fs_client(inode)->write_congested) { - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; - } - - wait_on_page_fscache(page); - - err = writepage_nounlock(page, wbc); - if (err == -ERESTARTSYS) { - /* direct memory reclaimer was killed by SIGKILL. return 0 - * to prevent caller from setting mapping/page error */ - err = 0; - } - unlock_page(page); - iput(inode); - return err; -} - /* * async writeback completion handler. 
* @@ -832,6 +872,7 @@ static void writepages_finish(struct ceph_osd_request *req) struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); unsigned int len = 0; bool remove_page; @@ -887,6 +928,12 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); + + if (atomic64_dec_return(&mdsc->dirty_folios) <= 0) { + wake_up_all(&mdsc->flush_end_wq); + WARN_ON(atomic64_read(&mdsc->dirty_folios) < 0); + } + doutc(cl, "unlocking %p\n", page); if (remove_page) @@ -916,36 +963,13 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_dec_osd_stopping_blocker(fsc->mdsc); } -/* - * initiate async writeback - */ -static int ceph_writepages_start(struct address_space *mapping, - struct writeback_control *wbc) +static inline +bool is_forced_umount(struct address_space *mapping) { struct inode *inode = mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; - struct ceph_vino vino = ceph_vino(inode); - pgoff_t index, start_index, end = -1; - struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; - struct folio_batch fbatch; - int rc = 0; - unsigned int wsize = i_blocksize(inode); - struct ceph_osd_request *req = NULL; - struct ceph_writeback_ctl ceph_wbc; - bool should_loop, range_whole = false; - bool done = false; - bool caching = ceph_is_cache_enabled(inode); - xa_mark_t tag; - - if (wbc->sync_mode == WB_SYNC_NONE && - fsc->write_congested) - return 0; - - doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), - wbc->sync_mode == WB_SYNC_NONE ? "NONE" : - (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); if (ceph_inode_is_shutdown(inode)) { if (ci->i_wrbuffer_ref > 0) { @@ -954,387 +978,733 @@ static int ceph_writepages_start(struct address_space *mapping, ceph_vinop(inode), ceph_ino(inode)); } mapping_set_error(mapping, -EIO); - return -EIO; /* we're in a forced umount, don't write! */ + return true; } + + return false; +} + +static inline +unsigned int ceph_define_write_size(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + unsigned int wsize = i_blocksize(inode); + if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - folio_batch_init(&fbatch); + return wsize; +} + +static inline +void ceph_folio_batch_init(struct ceph_writeback_ctl *ceph_wbc) +{ + folio_batch_init(&ceph_wbc->fbatch); + ceph_wbc->processed_in_fbatch = 0; +} + +static inline +void ceph_folio_batch_reinit(struct ceph_writeback_ctl *ceph_wbc) +{ + folio_batch_release(&ceph_wbc->fbatch); + ceph_folio_batch_init(ceph_wbc); +} + +static inline +void ceph_init_writeback_ctl(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + ceph_wbc->snapc = NULL; + ceph_wbc->last_snapc = NULL; - start_index = wbc->range_cyclic ? mapping->writeback_index : 0; - index = start_index; + ceph_wbc->strip_unit_end = 0; + ceph_wbc->wsize = ceph_define_write_size(mapping); + + ceph_wbc->nr_folios = 0; + ceph_wbc->max_pages = 0; + ceph_wbc->locked_pages = 0; + + ceph_wbc->done = false; + ceph_wbc->should_loop = false; + ceph_wbc->range_whole = false; + + ceph_wbc->start_index = wbc->range_cyclic ? 
mapping->writeback_index : 0; + ceph_wbc->index = ceph_wbc->start_index; + ceph_wbc->end = -1; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { - tag = PAGECACHE_TAG_TOWRITE; + ceph_wbc->tag = PAGECACHE_TAG_TOWRITE; } else { - tag = PAGECACHE_TAG_DIRTY; + ceph_wbc->tag = PAGECACHE_TAG_DIRTY; } -retry: + + ceph_wbc->op_idx = -1; + ceph_wbc->num_ops = 0; + ceph_wbc->offset = 0; + ceph_wbc->len = 0; + ceph_wbc->from_pool = false; + + ceph_folio_batch_init(ceph_wbc); + + ceph_wbc->pages = NULL; + ceph_wbc->data_pages = NULL; +} + +static inline +int ceph_define_writeback_range(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + /* find oldest snap context with dirty data */ - snapc = get_oldest_context(inode, &ceph_wbc, NULL); - if (!snapc) { + ceph_wbc->snapc = get_oldest_context(inode, ceph_wbc, NULL); + if (!ceph_wbc->snapc) { /* hmm, why does writepages get called when there is no dirty data? */ doutc(cl, " no snap context with dirty data?\n"); - goto out; + return -ENODATA; } - doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc, - snapc->seq, snapc->num_snaps); - should_loop = false; - if (ceph_wbc.head_snapc && snapc != last_snapc) { + doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", + ceph_wbc->snapc, ceph_wbc->snapc->seq, + ceph_wbc->snapc->num_snaps); + + ceph_wbc->should_loop = false; + + if (ceph_wbc->head_snapc && ceph_wbc->snapc != ceph_wbc->last_snapc) { /* where to start/end? */ if (wbc->range_cyclic) { - index = start_index; - end = -1; - if (index > 0) - should_loop = true; - doutc(cl, " cyclic, start at %lu\n", index); + ceph_wbc->index = ceph_wbc->start_index; + ceph_wbc->end = -1; + if (ceph_wbc->index > 0) + ceph_wbc->should_loop = true; + doutc(cl, " cyclic, start at %lu\n", ceph_wbc->index); } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; + ceph_wbc->index = wbc->range_start >> PAGE_SHIFT; + ceph_wbc->end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = true; - doutc(cl, " not cyclic, %lu to %lu\n", index, end); + ceph_wbc->range_whole = true; + doutc(cl, " not cyclic, %lu to %lu\n", + ceph_wbc->index, ceph_wbc->end); } - } else if (!ceph_wbc.head_snapc) { + } else if (!ceph_wbc->head_snapc) { /* Do not respect wbc->range_{start,end}. Dirty pages * in that range can be associated with newer snapc. 
* They are not writeable until we write all dirty pages * associated with 'snapc' get written */ - if (index > 0) - should_loop = true; + if (ceph_wbc->index > 0) + ceph_wbc->should_loop = true; doutc(cl, " non-head snapc, range whole\n"); } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, index, end); + ceph_put_snap_context(ceph_wbc->last_snapc); + ceph_wbc->last_snapc = ceph_wbc->snapc; - ceph_put_snap_context(last_snapc); - last_snapc = snapc; + return 0; +} - while (!done && index <= end) { - int num_ops = 0, op_idx; - unsigned i, nr_folios, max_pages, locked_pages = 0; - struct page **pages = NULL, **data_pages; - struct page *page; - pgoff_t strip_unit_end = 0; - u64 offset = 0, len = 0; - bool from_pool = false; +static inline +bool has_writeback_done(struct ceph_writeback_ctl *ceph_wbc) +{ + return ceph_wbc->done && ceph_wbc->index > ceph_wbc->end; +} - max_pages = wsize >> PAGE_SHIFT; +static inline +bool can_next_page_be_processed(struct ceph_writeback_ctl *ceph_wbc, + unsigned index) +{ + return index < ceph_wbc->nr_folios && + ceph_wbc->locked_pages < ceph_wbc->max_pages; +} -get_more_pages: - nr_folios = filemap_get_folios_tag(mapping, &index, - end, tag, &fbatch); - doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios); - if (!nr_folios && !locked_pages) - break; - for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { - page = &fbatch.folios[i]->page; - doutc(cl, "? %p idx %lu\n", page, page->index); - if (locked_pages == 0) - lock_page(page); /* first page */ - else if (!trylock_page(page)) - break; +static +int ceph_check_page_before_write(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc, + struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_snap_context *pgsnapc; - /* only dirty pages, or our accounting breaks */ - if (unlikely(!PageDirty(page)) || - unlikely(page->mapping != mapping)) { - doutc(cl, "!dirty or !mapping %p\n", page); - unlock_page(page); - continue; - } - /* only if matching snap context */ - pgsnapc = page_snap_context(page); - if (pgsnapc != snapc) { - doutc(cl, "page snapc %p %lld != oldest %p %lld\n", - pgsnapc, pgsnapc->seq, snapc, snapc->seq); - if (!should_loop && - !ceph_wbc.head_snapc && - wbc->sync_mode != WB_SYNC_NONE) - should_loop = true; - unlock_page(page); - continue; + /* only dirty folios, or our accounting breaks */ + if (unlikely(!folio_test_dirty(folio) || folio->mapping != mapping)) { + doutc(cl, "!dirty or !mapping %p\n", folio); + return -ENODATA; + } + + /* only if matching snap context */ + pgsnapc = page_snap_context(&folio->page); + if (pgsnapc != ceph_wbc->snapc) { + doutc(cl, "folio snapc %p %lld != oldest %p %lld\n", + pgsnapc, pgsnapc->seq, + ceph_wbc->snapc, ceph_wbc->snapc->seq); + + if (!ceph_wbc->should_loop && !ceph_wbc->head_snapc && + wbc->sync_mode != WB_SYNC_NONE) + ceph_wbc->should_loop = true; + + return -ENODATA; + } + + if (folio_pos(folio) >= ceph_wbc->i_size) { + doutc(cl, "folio at %lu beyond eof %llu\n", + folio->index, ceph_wbc->i_size); + + if ((ceph_wbc->size_stable || + folio_pos(folio) >= i_size_read(inode)) && + folio_clear_dirty_for_io(folio)) + folio_invalidate(folio, 0, folio_size(folio)); + + return -ENODATA; + } + + if (ceph_wbc->strip_unit_end && + (folio->index > ceph_wbc->strip_unit_end)) { + doutc(cl, "end of strip unit %p\n", folio); + return -E2BIG; + } 
+ + return 0; +} + +static inline +void __ceph_allocate_page_array(struct ceph_writeback_ctl *ceph_wbc, + unsigned int max_pages) +{ + ceph_wbc->pages = kmalloc_array(max_pages, + sizeof(*ceph_wbc->pages), + GFP_NOFS); + if (!ceph_wbc->pages) { + ceph_wbc->from_pool = true; + ceph_wbc->pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); + BUG_ON(!ceph_wbc->pages); + } +} + +static inline +void ceph_allocate_page_array(struct address_space *mapping, + struct ceph_writeback_ctl *ceph_wbc, + struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + u64 objnum; + u64 objoff; + u32 xlen; + + /* prepare async write request */ + ceph_wbc->offset = (u64)folio_pos(folio); + ceph_calc_file_object_mapping(&ci->i_layout, + ceph_wbc->offset, ceph_wbc->wsize, + &objnum, &objoff, &xlen); + + ceph_wbc->num_ops = 1; + ceph_wbc->strip_unit_end = folio->index + ((xlen - 1) >> PAGE_SHIFT); + + BUG_ON(ceph_wbc->pages); + ceph_wbc->max_pages = calc_pages_for(0, (u64)xlen); + __ceph_allocate_page_array(ceph_wbc, ceph_wbc->max_pages); + + ceph_wbc->len = 0; +} + +static inline +bool is_folio_index_contiguous(const struct ceph_writeback_ctl *ceph_wbc, + const struct folio *folio) +{ + return folio->index == (ceph_wbc->offset + ceph_wbc->len) >> PAGE_SHIFT; +} + +static inline +bool is_num_ops_too_big(struct ceph_writeback_ctl *ceph_wbc) +{ + return ceph_wbc->num_ops >= + (ceph_wbc->from_pool ? CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS); +} + +static inline +bool is_write_congestion_happened(struct ceph_fs_client *fsc) +{ + return atomic_long_inc_return(&fsc->writeback_count) > + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb); +} + +static inline int move_dirty_folio_in_page_array(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc, struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct page **pages = ceph_wbc->pages; + unsigned int index = ceph_wbc->locked_pages; + gfp_t gfp_flags = ceph_wbc->locked_pages ? GFP_NOWAIT : GFP_NOFS; + + if (IS_ENCRYPTED(inode)) { + pages[index] = fscrypt_encrypt_pagecache_blocks(folio, + PAGE_SIZE, + 0, + gfp_flags); + if (IS_ERR(pages[index])) { + if (PTR_ERR(pages[index]) == -EINVAL) { + pr_err_client(cl, "inode->i_blkbits=%hhu\n", + inode->i_blkbits); } - if (page_offset(page) >= ceph_wbc.i_size) { - struct folio *folio = page_folio(page); - - doutc(cl, "folio at %lu beyond eof %llu\n", - folio->index, ceph_wbc.i_size); - if ((ceph_wbc.size_stable || - folio_pos(folio) >= i_size_read(inode)) && - folio_clear_dirty_for_io(folio)) - folio_invalidate(folio, 0, - folio_size(folio)); + + /* better not fail on first page! */ + BUG_ON(ceph_wbc->locked_pages == 0); + + pages[index] = NULL; + return PTR_ERR(pages[index]); + } + } else { + pages[index] = &folio->page; + } + + ceph_wbc->locked_pages++; + + return 0; +} + +static +int ceph_process_folio_batch(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct folio *folio = NULL; + unsigned i; + int rc = 0; + + for (i = 0; can_next_page_be_processed(ceph_wbc, i); i++) { + folio = ceph_wbc->fbatch.folios[i]; + + if (!folio) + continue; + + doutc(cl, "? 
%p idx %lu, folio_test_writeback %#x, " + "folio_test_dirty %#x, folio_test_locked %#x\n", + folio, folio->index, folio_test_writeback(folio), + folio_test_dirty(folio), + folio_test_locked(folio)); + + if (folio_test_writeback(folio) || + folio_test_private_2(folio) /* [DEPRECATED] */) { + doutc(cl, "waiting on writeback %p\n", folio); + folio_wait_writeback(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ + continue; + } + + if (ceph_wbc->locked_pages == 0) + folio_lock(folio); + else if (!folio_trylock(folio)) + break; + + rc = ceph_check_page_before_write(mapping, wbc, + ceph_wbc, folio); + if (rc == -ENODATA) { + rc = 0; + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + continue; + } else if (rc == -E2BIG) { + rc = 0; + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + break; + } + + if (!folio_clear_dirty_for_io(folio)) { + doutc(cl, "%p !folio_clear_dirty_for_io\n", folio); + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + continue; + } + + /* + * We have something to write. If this is + * the first locked page this time through, + * calculate max possible write size and + * allocate a page array + */ + if (ceph_wbc->locked_pages == 0) { + ceph_allocate_page_array(mapping, ceph_wbc, folio); + } else if (!is_folio_index_contiguous(ceph_wbc, folio)) { + if (is_num_ops_too_big(ceph_wbc)) { + folio_redirty_for_writepage(wbc, folio); folio_unlock(folio); - continue; - } - if (strip_unit_end && (page->index > strip_unit_end)) { - doutc(cl, "end of strip unit %p\n", page); - unlock_page(page); break; } - if (PageWriteback(page) || PageFsCache(page)) { - if (wbc->sync_mode == WB_SYNC_NONE) { - doutc(cl, "%p under writeback\n", page); - unlock_page(page); - continue; - } - doutc(cl, "waiting on writeback %p\n", page); - wait_on_page_writeback(page); - wait_on_page_fscache(page); - } - if (!clear_page_dirty_for_io(page)) { - doutc(cl, "%p !clear_page_dirty_for_io\n", page); - unlock_page(page); - continue; - } + ceph_wbc->num_ops++; + ceph_wbc->offset = (u64)folio_pos(folio); + ceph_wbc->len = 0; + } - /* - * We have something to write. If this is - * the first locked page this time through, - * calculate max possinle write size and - * allocate a page array - */ - if (locked_pages == 0) { - u64 objnum; - u64 objoff; - u32 xlen; - - /* prepare async write request */ - offset = (u64)page_offset(page); - ceph_calc_file_object_mapping(&ci->i_layout, - offset, wsize, - &objnum, &objoff, - &xlen); - len = xlen; - - num_ops = 1; - strip_unit_end = page->index + - ((len - 1) >> PAGE_SHIFT); - - BUG_ON(pages); - max_pages = calc_pages_for(0, (u64)len); - pages = kmalloc_array(max_pages, - sizeof(*pages), - GFP_NOFS); - if (!pages) { - from_pool = true; - pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); - BUG_ON(!pages); - } - - len = 0; - } else if (page->index != - (offset + len) >> PAGE_SHIFT) { - if (num_ops >= (from_pool ? 
CEPH_OSD_SLAB_OPS : - CEPH_OSD_MAX_OPS)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - break; - } - - num_ops++; - offset = (u64)page_offset(page); - len = 0; - } + /* note position of first page in fbatch */ + doutc(cl, "%llx.%llx will write folio %p idx %lu\n", + ceph_vinop(inode), folio, folio->index); - /* note position of first page in fbatch */ - doutc(cl, "%llx.%llx will write page %p idx %lu\n", - ceph_vinop(inode), page, page->index); - - if (atomic_long_inc_return(&fsc->writeback_count) > - CONGESTION_ON_THRESH( - fsc->mount_options->congestion_kb)) - fsc->write_congested = true; - - if (IS_ENCRYPTED(inode)) { - pages[locked_pages] = - fscrypt_encrypt_pagecache_blocks(page, - PAGE_SIZE, 0, - locked_pages ? GFP_NOWAIT : GFP_NOFS); - if (IS_ERR(pages[locked_pages])) { - if (PTR_ERR(pages[locked_pages]) == -EINVAL) - pr_err_client(cl, - "inode->i_blkbits=%hhu\n", - inode->i_blkbits); - /* better not fail on first page! */ - BUG_ON(locked_pages == 0); - pages[locked_pages] = NULL; - redirty_page_for_writepage(wbc, page); - unlock_page(page); - break; - } - ++locked_pages; - } else { - pages[locked_pages++] = page; - } + fsc->write_congested = is_write_congestion_happened(fsc); - fbatch.folios[i] = NULL; - len += thp_size(page); + rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc, + folio); + if (rc) { + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); + break; } - /* did we get anything? */ - if (!locked_pages) - goto release_folios; - if (i) { - unsigned j, n = 0; - /* shift unused page to beginning of fbatch */ - for (j = 0; j < nr_folios; j++) { - if (!fbatch.folios[j]) - continue; - if (n < j) - fbatch.folios[n] = fbatch.folios[j]; - n++; - } - fbatch.nr = n; + ceph_wbc->fbatch.folios[i] = NULL; + ceph_wbc->len += folio_size(folio); + } - if (nr_folios && i == nr_folios && - locked_pages < max_pages) { - doutc(cl, "reached end fbatch, trying for more\n"); - folio_batch_release(&fbatch); - goto get_more_pages; - } + ceph_wbc->processed_in_fbatch = i; + + return rc; +} + +static inline +void ceph_shift_unused_folios_left(struct folio_batch *fbatch) +{ + unsigned j, n = 0; + + /* shift unused page to beginning of fbatch */ + for (j = 0; j < folio_batch_count(fbatch); j++) { + if (!fbatch->folios[j]) + continue; + + if (n < j) { + fbatch->folios[n] = fbatch->folios[j]; } + n++; + } + + fbatch->nr = n; +} + +static +int ceph_submit_write(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_vino vino = ceph_vino(inode); + struct ceph_osd_request *req = NULL; + struct page *page = NULL; + bool caching = ceph_is_cache_enabled(inode); + u64 offset; + u64 len; + unsigned i; + new_request: - offset = ceph_fscrypt_page_offset(pages[0]); - len = wsize; + offset = ceph_fscrypt_page_offset(ceph_wbc->pages[0]); + len = ceph_wbc->wsize; + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, 0, ceph_wbc->num_ops, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + ceph_wbc->snapc, ceph_wbc->truncate_seq, + ceph_wbc->truncate_size, false); + if (IS_ERR(req)) { req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, vino, - offset, &len, 0, num_ops, - CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, - snapc, ceph_wbc.truncate_seq, - ceph_wbc.truncate_size, false); - if 
(IS_ERR(req)) { - req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, vino, - offset, &len, 0, - min(num_ops, - CEPH_OSD_SLAB_OPS), - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE, - snapc, ceph_wbc.truncate_seq, - ceph_wbc.truncate_size, true); - BUG_ON(IS_ERR(req)); + &ci->i_layout, vino, + offset, &len, 0, + min(ceph_wbc->num_ops, + CEPH_OSD_SLAB_OPS), + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, + ceph_wbc->snapc, + ceph_wbc->truncate_seq, + ceph_wbc->truncate_size, + true); + BUG_ON(IS_ERR(req)); + } + + page = ceph_wbc->pages[ceph_wbc->locked_pages - 1]; + BUG_ON(len < ceph_fscrypt_page_offset(page) + thp_size(page) - offset); + + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + for (i = 0; i < folio_batch_count(&ceph_wbc->fbatch); i++) { + struct folio *folio = ceph_wbc->fbatch.folios[i]; + + if (!folio) + continue; + + page = &folio->page; + redirty_page_for_writepage(wbc, page); + unlock_page(page); } - BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) + - thp_size(pages[locked_pages - 1]) - offset); - if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { - rc = -EIO; - goto release_folios; + for (i = 0; i < ceph_wbc->locked_pages; i++) { + page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]); + + if (!page) + continue; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); } - req->r_callback = writepages_finish; - req->r_inode = inode; - - /* Format the osd request message and submit the write */ - len = 0; - data_pages = pages; - op_idx = 0; - for (i = 0; i < locked_pages; i++) { - struct page *page = ceph_fscrypt_pagecache_page(pages[i]); - - u64 cur_offset = page_offset(page); - /* - * Discontinuity in page range? Ceph can handle that by just passing - * multiple extents in the write op. - */ - if (offset + len != cur_offset) { - /* If it's full, stop here */ - if (op_idx + 1 == req->r_num_ops) - break; - - /* Kick off an fscache write with what we have so far. */ - ceph_fscache_write_to_cache(inode, offset, len, caching); - - /* Start a new extent */ - osd_req_op_extent_dup_last(req, op_idx, - cur_offset - offset); - doutc(cl, "got pages at %llu~%llu\n", offset, - len); - osd_req_op_extent_osd_data_pages(req, op_idx, - data_pages, len, 0, - from_pool, false); - osd_req_op_extent_update(req, op_idx, len); - - len = 0; - offset = cur_offset; - data_pages = pages + i; - op_idx++; - } - set_page_writeback(page); - if (caching) - ceph_set_page_fscache(page); - len += thp_size(page); + ceph_osdc_put_request(req); + return -EIO; + } + + req->r_callback = writepages_finish; + req->r_inode = inode; + + /* Format the osd request message and submit the write */ + len = 0; + ceph_wbc->data_pages = ceph_wbc->pages; + ceph_wbc->op_idx = 0; + for (i = 0; i < ceph_wbc->locked_pages; i++) { + u64 cur_offset; + + page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]); + cur_offset = page_offset(page); + + /* + * Discontinuity in page range? Ceph can handle that by just passing + * multiple extents in the write op. + */ + if (offset + len != cur_offset) { + /* If it's full, stop here */ + if (ceph_wbc->op_idx + 1 == req->r_num_ops) + break; + + /* Kick off an fscache write with what we have so far. 
*/ + ceph_fscache_write_to_cache(inode, offset, len, caching); + + /* Start a new extent */ + osd_req_op_extent_dup_last(req, ceph_wbc->op_idx, + cur_offset - offset); + + doutc(cl, "got pages at %llu~%llu\n", offset, len); + + osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx, + ceph_wbc->data_pages, + len, 0, + ceph_wbc->from_pool, + false); + osd_req_op_extent_update(req, ceph_wbc->op_idx, len); + + len = 0; + offset = cur_offset; + ceph_wbc->data_pages = ceph_wbc->pages + i; + ceph_wbc->op_idx++; } - ceph_fscache_write_to_cache(inode, offset, len, caching); - - if (ceph_wbc.size_stable) { - len = min(len, ceph_wbc.i_size - offset); - } else if (i == locked_pages) { - /* writepages_finish() clears writeback pages - * according to the data length, so make sure - * data length covers all locked pages */ - u64 min_len = len + 1 - thp_size(page); - len = get_writepages_data_length(inode, pages[i - 1], - offset); - len = max(len, min_len); + + set_page_writeback(page); + + if (caching) + ceph_set_page_fscache(page); + + len += thp_size(page); + } + + ceph_fscache_write_to_cache(inode, offset, len, caching); + + if (ceph_wbc->size_stable) { + len = min(len, ceph_wbc->i_size - offset); + } else if (i == ceph_wbc->locked_pages) { + /* writepages_finish() clears writeback pages + * according to the data length, so make sure + * data length covers all locked pages */ + u64 min_len = len + 1 - thp_size(page); + len = get_writepages_data_length(inode, + ceph_wbc->pages[i - 1], + offset); + len = max(len, min_len); + } + + if (IS_ENCRYPTED(inode)) + len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); + + doutc(cl, "got pages at %llu~%llu\n", offset, len); + + if (IS_ENCRYPTED(inode) && + ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) { + pr_warn_client(cl, + "bad encrypted write offset=%lld len=%llu\n", + offset, len); + } + + osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx, + ceph_wbc->data_pages, len, + 0, ceph_wbc->from_pool, false); + osd_req_op_extent_update(req, ceph_wbc->op_idx, len); + + BUG_ON(ceph_wbc->op_idx + 1 != req->r_num_ops); + + ceph_wbc->from_pool = false; + if (i < ceph_wbc->locked_pages) { + BUG_ON(ceph_wbc->num_ops <= req->r_num_ops); + ceph_wbc->num_ops -= req->r_num_ops; + ceph_wbc->locked_pages -= i; + + /* allocate new pages array for next request */ + ceph_wbc->data_pages = ceph_wbc->pages; + __ceph_allocate_page_array(ceph_wbc, ceph_wbc->locked_pages); + memcpy(ceph_wbc->pages, ceph_wbc->data_pages + i, + ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages)); + memset(ceph_wbc->data_pages + i, 0, + ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages)); + } else { + BUG_ON(ceph_wbc->num_ops != req->r_num_ops); + /* request message now owns the pages array */ + ceph_wbc->pages = NULL; + } + + req->r_mtime = inode_get_mtime(inode); + ceph_osdc_start_request(&fsc->client->osdc, req); + req = NULL; + + wbc->nr_to_write -= i; + if (ceph_wbc->pages) + goto new_request; + + return 0; +} + +static +void ceph_wait_until_current_writes_complete(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct page *page; + unsigned i, nr; + + if (wbc->sync_mode != WB_SYNC_NONE && + ceph_wbc->start_index == 0 && /* all dirty pages were checked */ + !ceph_wbc->head_snapc) { + ceph_wbc->index = 0; + + while ((ceph_wbc->index <= ceph_wbc->end) && + (nr = filemap_get_folios_tag(mapping, + &ceph_wbc->index, + (pgoff_t)-1, + PAGECACHE_TAG_WRITEBACK, + &ceph_wbc->fbatch))) { + for (i = 0; i < nr; i++) { + page = 
&ceph_wbc->fbatch.folios[i]->page; + if (page_snap_context(page) != ceph_wbc->snapc) + continue; + wait_on_page_writeback(page); + } + + folio_batch_release(&ceph_wbc->fbatch); + cond_resched(); } - if (IS_ENCRYPTED(inode)) - len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); + } +} - doutc(cl, "got pages at %llu~%llu\n", offset, len); +/* + * initiate async writeback + */ +static int ceph_writepages_start(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_writeback_ctl ceph_wbc; + int rc = 0; - if (IS_ENCRYPTED(inode) && - ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) - pr_warn_client(cl, - "bad encrypted write offset=%lld len=%llu\n", - offset, len); - - osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, - 0, from_pool, false); - osd_req_op_extent_update(req, op_idx, len); - - BUG_ON(op_idx + 1 != req->r_num_ops); - - from_pool = false; - if (i < locked_pages) { - BUG_ON(num_ops <= req->r_num_ops); - num_ops -= req->r_num_ops; - locked_pages -= i; - - /* allocate new pages array for next request */ - data_pages = pages; - pages = kmalloc_array(locked_pages, sizeof(*pages), - GFP_NOFS); - if (!pages) { - from_pool = true; - pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); - BUG_ON(!pages); + if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested) + return 0; + + doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), + wbc->sync_mode == WB_SYNC_NONE ? "NONE" : + (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); + + if (is_forced_umount(mapping)) { + /* we're in a forced umount, don't write! */ + return -EIO; + } + + ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc); + + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + rc = -EIO; + goto out; + } + +retry: + rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc); + if (rc == -ENODATA) { + /* hmm, why does writepages get called when there + is no dirty data? */ + rc = 0; + goto dec_osd_stopping_blocker; + } + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end); + + while (!has_writeback_done(&ceph_wbc)) { + ceph_wbc.locked_pages = 0; + ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT; + +get_more_pages: + ceph_folio_batch_reinit(&ceph_wbc); + + ceph_wbc.nr_folios = filemap_get_folios_tag(mapping, + &ceph_wbc.index, + ceph_wbc.end, + ceph_wbc.tag, + &ceph_wbc.fbatch); + doutc(cl, "pagevec_lookup_range_tag for tag %#x got %d\n", + ceph_wbc.tag, ceph_wbc.nr_folios); + + if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages) + break; + +process_folio_batch: + rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc); + if (rc) + goto release_folios; + + /* did we get anything? 
*/ + if (!ceph_wbc.locked_pages) + goto release_folios; + + if (ceph_wbc.processed_in_fbatch) { + ceph_shift_unused_folios_left(&ceph_wbc.fbatch); + + if (folio_batch_count(&ceph_wbc.fbatch) == 0 && + ceph_wbc.locked_pages < ceph_wbc.max_pages) { + doutc(cl, "reached end fbatch, trying for more\n"); + goto get_more_pages; } - memcpy(pages, data_pages + i, - locked_pages * sizeof(*pages)); - memset(data_pages + i, 0, - locked_pages * sizeof(*pages)); - } else { - BUG_ON(num_ops != req->r_num_ops); - index = pages[i - 1]->index + 1; - /* request message now owns the pages array */ - pages = NULL; } - req->r_mtime = inode_get_mtime(inode); - ceph_osdc_start_request(&fsc->client->osdc, req); - req = NULL; + rc = ceph_submit_write(mapping, wbc, &ceph_wbc); + if (rc) + goto release_folios; + + ceph_wbc.locked_pages = 0; + ceph_wbc.strip_unit_end = 0; - wbc->nr_to_write -= i; - if (pages) - goto new_request; + if (folio_batch_count(&ceph_wbc.fbatch) > 0) { + ceph_wbc.nr_folios = + folio_batch_count(&ceph_wbc.fbatch); + goto process_folio_batch; + } /* * We stop writing back only if we are not doing @@ -1343,61 +1713,44 @@ new_request: * we tagged for writeback prior to entering this loop. */ if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) - done = true; + ceph_wbc.done = true; release_folios: doutc(cl, "folio_batch release on %d folios (%p)\n", - (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL); - folio_batch_release(&fbatch); + (int)ceph_wbc.fbatch.nr, + ceph_wbc.fbatch.nr ? ceph_wbc.fbatch.folios[0] : NULL); + folio_batch_release(&ceph_wbc.fbatch); } - if (should_loop && !done) { + if (ceph_wbc.should_loop && !ceph_wbc.done) { /* more to do; loop back to beginning of file */ doutc(cl, "looping back to beginning of file\n"); - end = start_index - 1; /* OK even when start_index == 0 */ + /* OK even when start_index == 0 */ + ceph_wbc.end = ceph_wbc.start_index - 1; /* to write dirty pages associated with next snapc, * we need to wait until current writes complete */ - if (wbc->sync_mode != WB_SYNC_NONE && - start_index == 0 && /* all dirty pages were checked */ - !ceph_wbc.head_snapc) { - struct page *page; - unsigned i, nr; - index = 0; - while ((index <= end) && - (nr = filemap_get_folios_tag(mapping, &index, - (pgoff_t)-1, - PAGECACHE_TAG_WRITEBACK, - &fbatch))) { - for (i = 0; i < nr; i++) { - page = &fbatch.folios[i]->page; - if (page_snap_context(page) != snapc) - continue; - wait_on_page_writeback(page); - } - folio_batch_release(&fbatch); - cond_resched(); - } - } + ceph_wait_until_current_writes_complete(mapping, wbc, &ceph_wbc); - start_index = 0; - index = 0; + ceph_wbc.start_index = 0; + ceph_wbc.index = 0; goto retry; } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; + if (wbc->range_cyclic || (ceph_wbc.range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = ceph_wbc.index; + +dec_osd_stopping_blocker: + ceph_dec_osd_stopping_blocker(fsc->mdsc); out: - ceph_osdc_put_request(req); - ceph_put_snap_context(last_snapc); + ceph_put_snap_context(ceph_wbc.last_snapc); doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode), rc); + return rc; } - - /* * See if a given @snapc is either writeable, or already written. 
*/ @@ -1413,56 +1766,56 @@ static int context_is_writeable_or_written(struct inode *inode, /** * ceph_find_incompatible - find an incompatible context and return it - * @page: page being dirtied + * @folio: folio being dirtied * - * We are only allowed to write into/dirty a page if the page is + * We are only allowed to write into/dirty a folio if the folio is * clean, or already dirty within the same snap context. Returns a * conflicting context if there is one, NULL if there isn't, or a * negative error code on other errors. * - * Must be called with page lock held. + * Must be called with folio lock held. */ static struct ceph_snap_context * -ceph_find_incompatible(struct page *page) +ceph_find_incompatible(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); if (ceph_inode_is_shutdown(inode)) { - doutc(cl, " %llx.%llx page %p is shutdown\n", - ceph_vinop(inode), page); + doutc(cl, " %llx.%llx folio %p is shutdown\n", + ceph_vinop(inode), folio); return ERR_PTR(-ESTALE); } for (;;) { struct ceph_snap_context *snapc, *oldest; - wait_on_page_writeback(page); + folio_wait_writeback(folio); - snapc = page_snap_context(page); + snapc = page_snap_context(&folio->page); if (!snapc || snapc == ci->i_head_snapc) break; /* - * this page is already dirty in another (older) snap + * this folio is already dirty in another (older) snap * context! is it writeable now? */ oldest = get_oldest_context(inode, NULL, NULL); if (snapc->seq > oldest->seq) { /* not writeable -- return it for the caller to deal with */ ceph_put_snap_context(oldest); - doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n", - ceph_vinop(inode), page, snapc); + doutc(cl, " %llx.%llx folio %p snapc %p not current or oldest\n", + ceph_vinop(inode), folio, snapc); return ceph_get_snap_context(snapc); } ceph_put_snap_context(oldest); - /* yay, writeable, do it now (without dropping page lock) */ - doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n", - ceph_vinop(inode), page, snapc); - if (clear_page_dirty_for_io(page)) { - int r = writepage_nounlock(page, NULL); + /* yay, writeable, do it now (without dropping folio lock) */ + doutc(cl, " %llx.%llx folio %p snapc %p not current, but oldest\n", + ceph_vinop(inode), folio, snapc); + if (folio_clear_dirty_for_io(folio)) { + int r = write_folio_nounlock(folio, NULL); if (r < 0) return ERR_PTR(r); } @@ -1477,7 +1830,7 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; - snapc = ceph_find_incompatible(folio_page(*foliop, 0)); + snapc = ceph_find_incompatible(*foliop); if (snapc) { int r; @@ -1502,20 +1855,18 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned */ static int ceph_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct folio *folio = NULL; int r; - r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); + r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, foliop, NULL); if (r < 0) return r; - folio_wait_fscache(folio); - WARN_ON_ONCE(!folio_test_locked(folio)); - *pagep = &folio->page; + 
folio_wait_private_2(*foliop); /* [DEPRECATED] */ + WARN_ON_ONCE(!folio_test_locked(*foliop)); return 0; } @@ -1525,9 +1876,8 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, */ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *subpage, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(subpage); struct inode *inode = file_inode(file); struct ceph_client *cl = ceph_inode_to_client(inode); bool check_cap = false; @@ -1563,7 +1913,6 @@ out: const struct address_space_operations ceph_aops = { .read_folio = netfs_read_folio, .readahead = netfs_readahead, - .writepage = ceph_writepage, .writepages = ceph_writepages_start, .write_begin = ceph_write_begin, .write_end = ceph_write_end, @@ -1571,6 +1920,7 @@ const struct address_space_operations ceph_aops = { .invalidate_folio = ceph_invalidate_folio, .release_folio = netfs_release_folio, .direct_IO = noop_direct_IO, + .migrate_folio = filemap_migrate_folio, }; static void ceph_block_sigs(sigset_t *oldset) @@ -1687,8 +2037,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; struct ceph_cap_flush *prealloc_cf; - struct page *page = vmf->page; - loff_t off = page_offset(page); + struct folio *folio = page_folio(vmf->page); + loff_t off = folio_pos(folio); loff_t size = i_size_read(inode); size_t len; int want, got, err; @@ -1705,10 +2055,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); - if (off + thp_size(page) <= size) - len = thp_size(page); + if (off + folio_size(folio) <= size) + len = folio_size(folio); else - len = offset_in_thp(page, size); + len = offset_in_folio(folio, size); doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n", ceph_vinop(inode), off, len, size); @@ -1725,30 +2075,30 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode), off, len, ceph_cap_string(got)); - /* Update time before taking page lock */ + /* Update time before taking folio lock */ file_update_time(vma->vm_file); inode_inc_iversion_raw(inode); do { struct ceph_snap_context *snapc; - lock_page(page); + folio_lock(folio); - if (page_mkwrite_check_truncate(page, inode) < 0) { - unlock_page(page); + if (folio_mkwrite_check_truncate(folio, inode) < 0) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; break; } - snapc = ceph_find_incompatible(page); + snapc = ceph_find_incompatible(folio); if (!snapc) { - /* success. we'll keep the page locked. */ - set_page_dirty(page); + /* success. we'll keep the folio locked. */ + folio_mark_dirty(folio); ret = VM_FAULT_LOCKED; break; } - unlock_page(page); + folio_unlock(folio); if (IS_ERR(snapc)) { ret = VM_FAULT_SIGBUS; @@ -2127,7 +2477,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, } pool_ns_len = pool_ns ? pool_ns->len : 0; - perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS); + perm = kmalloc(struct_size(perm, pool_ns, pool_ns_len + 1), GFP_NOFS); if (!perm) { err = -ENOMEM; goto out_unlock; @@ -2174,7 +2524,7 @@ int ceph_pool_perm_check(struct inode *inode, int need) if (ci->i_vino.snap != CEPH_NOSNAP) { /* * Pool permission check needs to write to the first object. 
- * But for snapshot, head of the first object may have alread + * But for snapshot, head of the first object may have already * been deleted. Skip check to avoid creating orphan object. */ return 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c4941ba245ac..a8d8b56cf9d2 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -10,6 +10,7 @@ #include <linux/writeback.h> #include <linux/iversion.h> #include <linux/filelock.h> +#include <linux/jiffies.h> #include "super.h" #include "mds_client.h" @@ -977,20 +978,6 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, return 0; } -int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) -{ - struct inode *inode = &ci->netfs.inode; - struct ceph_client *cl = ceph_inode_to_client(inode); - int ret; - - spin_lock(&ci->i_ceph_lock); - ret = __ceph_caps_revoking_other(ci, NULL, mask); - spin_unlock(&ci->i_ceph_lock); - doutc(cl, "%p %llx.%llx %s = %d\n", inode, ceph_vinop(inode), - ceph_cap_string(mask), ret); - return ret; -} - int __ceph_caps_used(struct ceph_inode_info *ci) { int used = 0; @@ -2016,6 +2003,8 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) * CHECK_CAPS_AUTHONLY - we should only check the auth cap * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without * further delay. + * CHECK_CAPS_FLUSH_FORCE - we should flush any caps immediately, without + * further delay. */ void ceph_check_caps(struct ceph_inode_info *ci, int flags) { @@ -2097,7 +2086,7 @@ retry: } doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s " - "flushing %s issued %s revoking %s retain %s %s%s%s\n", + "flushing %s issued %s revoking %s retain %s %s%s%s%s\n", inode, ceph_vinop(inode), ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), @@ -2105,7 +2094,8 @@ retry: ceph_cap_string(retain), (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "", - (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : ""); + (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "", + (flags & CHECK_CAPS_FLUSH_FORCE) ? " FLUSH_FORCE" : ""); /* * If we no longer need to hold onto old our caps, and we may @@ -2180,6 +2170,11 @@ retry: queue_writeback = true; } + if (flags & CHECK_CAPS_FLUSH_FORCE) { + doutc(cl, "force to flush caps\n"); + goto ack; + } + if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { /* request larger max_size from MDS? */ @@ -2804,7 +2799,7 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, * requested from the MDS. * * Returns 0 if caps were not able to be acquired (yet), 1 if succeed, - * or a negative error code. There are 3 speical error codes: + * or a negative error code. There are 3 special error codes: * -EAGAIN: need to sleep but non-blocking is specified * -EFBIG: ask caller to call check_max_size() and try again. * -EUCLEAN: ask caller to call ceph_renew_caps() and try again. 
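
The corrected comment above spells out the three special return codes from the cap acquisition path: -EAGAIN (would need to sleep but non-blocking was requested), -EFBIG (ask for a larger max_size and retry), -EUCLEAN (renew caps and retry). A minimal sketch of the retry policy those codes imply; the helpers cap_try_get(), cap_wait_unblocked(), cap_ask_larger_max_size() and cap_renew() are placeholders for the real static helpers in caps.c, not actual kernel APIs:

/* placeholders, declared only for this sketch */
extern int cap_try_get(struct inode *inode, int need, int want,
		       loff_t endoff, int *got);
extern void cap_wait_unblocked(struct inode *inode);
extern void cap_ask_larger_max_size(struct inode *inode, loff_t endoff);
extern void cap_renew(struct inode *inode);

static int get_caps_sketch(struct inode *inode, int need, int want,
			   loff_t endoff, int *got, bool nonblock)
{
	for (;;) {
		int ret = cap_try_get(inode, need, want, endoff, got);

		if (ret > 0)
			return 0;			/* caps acquired */
		if (ret == 0 || ret == -EAGAIN) {	/* not yet available */
			if (nonblock)
				return -EAGAIN;		/* caller refused to sleep */
			cap_wait_unblocked(inode);	/* sleep, then retry */
		} else if (ret == -EFBIG) {
			cap_ask_larger_max_size(inode, endoff);	/* then retry */
		} else if (ret == -EUCLEAN) {
			cap_renew(inode);		/* then retry */
		} else {
			return ret;			/* hard error */
		}
	}
}
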
@@ -3067,10 +3062,13 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, flags, &_got); WARN_ON_ONCE(ret == -EAGAIN); if (!ret) { +#ifdef CONFIG_DEBUG_FS struct ceph_mds_client *mdsc = fsc->mdsc; struct cap_wait cw; +#endif DEFINE_WAIT_FUNC(wait, woken_wake_function); +#ifdef CONFIG_DEBUG_FS cw.ino = ceph_ino(inode); cw.tgid = current->tgid; cw.need = need; @@ -3079,6 +3077,7 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, spin_lock(&mdsc->caps_list_lock); list_add(&cw.list, &mdsc->cap_wait_list); spin_unlock(&mdsc->caps_list_lock); +#endif /* make sure used fmode not timeout */ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); @@ -3097,9 +3096,11 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, remove_wait_queue(&ci->i_cap_wq, &wait); ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); +#ifdef CONFIG_DEBUG_FS spin_lock(&mdsc->caps_list_lock); list_del(&cw.list); spin_unlock(&mdsc->caps_list_lock); +#endif if (ret == -EAGAIN) continue; @@ -3504,6 +3505,8 @@ static void handle_cap_grant(struct inode *inode, bool queue_invalidate = false; bool deleted_inode = false; bool fill_inline = false; + bool revoke_wait = false; + int flags = 0; /* * If there is at least one crypto block then we'll trust @@ -3699,16 +3702,18 @@ static void handle_cap_grant(struct inode *inode, ceph_cap_string(cap->issued), ceph_cap_string(newcaps), ceph_cap_string(revoking)); if (S_ISREG(inode->i_mode) && - (revoking & used & CEPH_CAP_FILE_BUFFER)) + (revoking & used & CEPH_CAP_FILE_BUFFER)) { writeback = true; /* initiate writeback; will delay ack */ - else if (queue_invalidate && + revoke_wait = true; + } else if (queue_invalidate && revoking == CEPH_CAP_FILE_CACHE && - (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) - ; /* do nothing yet, invalidation will be queued */ - else if (cap == ci->i_auth_cap) + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) { + revoke_wait = true; /* do nothing yet, invalidation will be queued */ + } else if (cap == ci->i_auth_cap) { check_caps = 1; /* check auth cap only */ - else + } else { check_caps = 2; /* check all caps */ + } /* If there is new caps, try to wake up the waiters */ if (~cap->issued & newcaps) wake = true; @@ -3735,8 +3740,9 @@ static void handle_cap_grant(struct inode *inode, BUG_ON(cap->issued & ~cap->implemented); /* don't let check_caps skip sending a response to MDS for revoke msgs */ - if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { + if (!revoke_wait && le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { cap->mds_wanted = 0; + flags |= CHECK_CAPS_FLUSH_FORCE; if (cap == ci->i_auth_cap) check_caps = 1; /* check auth cap only */ else @@ -3792,9 +3798,9 @@ static void handle_cap_grant(struct inode *inode, mutex_unlock(&session->s_mutex); if (check_caps == 1) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); + ceph_check_caps(ci, flags | CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); else if (check_caps == 2) - ceph_check_caps(ci, CHECK_CAPS_NOINVAL); + ceph_check_caps(ci, flags | CHECK_CAPS_NOINVAL); } /* @@ -4065,23 +4071,22 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, struct ceph_cap *cap, *tcap, *new_cap = NULL; struct ceph_inode_info *ci = ceph_inode(inode); u64 t_cap_id; - unsigned mseq = le32_to_cpu(ex->migrate_seq); - unsigned t_seq, t_mseq; + u32 t_issue_seq, t_mseq; int target, issued; int mds = session->s_mds; if (ph) { t_cap_id = le64_to_cpu(ph->cap_id); - t_seq = le32_to_cpu(ph->seq); + t_issue_seq = le32_to_cpu(ph->issue_seq); t_mseq = 
le32_to_cpu(ph->mseq); target = le32_to_cpu(ph->mds); } else { - t_cap_id = t_seq = t_mseq = 0; + t_cap_id = t_issue_seq = t_mseq = 0; target = -1; } - doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d target %d\n", - inode, ceph_vinop(inode), ci, mds, mseq, target); + doutc(cl, " cap %llx.%llx export to peer %d piseq %u pmseq %u\n", + ceph_vinop(inode), target, t_issue_seq, t_mseq); retry: down_read(&mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); @@ -4114,12 +4119,12 @@ retry: if (tcap) { /* already have caps from the target */ if (tcap->cap_id == t_cap_id && - ceph_seq_cmp(tcap->seq, t_seq) < 0) { + ceph_seq_cmp(tcap->seq, t_issue_seq) < 0) { doutc(cl, " updating import cap %p mds%d\n", tcap, target); tcap->cap_id = t_cap_id; - tcap->seq = t_seq - 1; - tcap->issue_seq = t_seq - 1; + tcap->seq = t_issue_seq - 1; + tcap->issue_seq = t_issue_seq - 1; tcap->issued |= issued; tcap->implemented |= issued; if (cap == ci->i_auth_cap) { @@ -4130,11 +4135,11 @@ retry: ceph_remove_cap(mdsc, cap, false); goto out_unlock; } else if (tsession) { - /* add placeholder for the export tagert */ + /* add placeholder for the export target */ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; tcap = new_cap; ceph_add_cap(inode, tsession, t_cap_id, issued, 0, - t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + t_issue_seq - 1, t_mseq, (u64)-1, flag, &new_cap); if (!list_empty(&ci->i_cap_flush_list) && ci->i_auth_cap == tcap) { @@ -4208,18 +4213,22 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); u64 p_cap_id; + u32 piseq = 0; + u32 pmseq = 0; int peer; if (ph) { p_cap_id = le64_to_cpu(ph->cap_id); peer = le32_to_cpu(ph->mds); + piseq = le32_to_cpu(ph->issue_seq); + pmseq = le32_to_cpu(ph->mseq); } else { p_cap_id = 0; peer = -1; } - doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d peer %d\n", - inode, ceph_vinop(inode), ci, mds, mseq, peer); + doutc(cl, " cap %llx.%llx import from peer %d piseq %u pmseq %u\n", + ceph_vinop(inode), peer, piseq, pmseq); retry: cap = __get_cap_for_mds(ci, mds); if (!cap) { @@ -4248,15 +4257,13 @@ retry: doutc(cl, " remove export cap %p mds%d flags %d\n", ocap, peer, ph->flags); if ((ph->flags & CEPH_CAP_FLAG_AUTH) && - (ocap->seq != le32_to_cpu(ph->seq) || - ocap->mseq != le32_to_cpu(ph->mseq))) { + (ocap->seq != piseq || + ocap->mseq != pmseq)) { pr_err_ratelimited_client(cl, "mismatched seq/mseq: " "%p %llx.%llx mds%d seq %d mseq %d" " importer mds%d has peer seq %d mseq %d\n", inode, ceph_vinop(inode), peer, - ocap->seq, ocap->mseq, mds, - le32_to_cpu(ph->seq), - le32_to_cpu(ph->mseq)); + ocap->seq, ocap->mseq, mds, piseq, pmseq); } ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } @@ -4330,7 +4337,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_snap_realm *realm = NULL; int op; int msg_version = le16_to_cpu(msg->hdr.version); - u32 seq, mseq; + u32 seq, mseq, issue_seq; struct ceph_vino vino; void *snaptrace; size_t snaptrace_len; @@ -4340,8 +4347,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, bool close_sessions = false; bool do_cap_release = false; - doutc(cl, "from mds%d\n", session->s_mds); - if (!ceph_inc_mds_stopping_blocker(mdsc, session)) return; @@ -4355,6 +4360,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, vino.snap = CEPH_NOSNAP; seq = le32_to_cpu(h->seq); mseq = le32_to_cpu(h->migrate_seq); + issue_seq = le32_to_cpu(h->issue_seq); snaptrace = h + 1; snaptrace_len = le32_to_cpu(h->snap_trace_len); @@ 
-4442,12 +4448,11 @@ void ceph_handle_caps(struct ceph_mds_session *session, /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); - doutc(cl, " op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), - vino.ino, vino.snap, inode); + doutc(cl, " caps mds%d op %s ino %llx.%llx inode %p seq %u iseq %u mseq %u\n", + session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode, + seq, issue_seq, mseq); mutex_lock(&session->s_mutex); - doutc(cl, " mds%d seq %lld cap seq %u\n", session->s_mds, - session->s_seq, (unsigned)seq); if (!inode) { doutc(cl, " i don't have ino %llx\n", vino.ino); @@ -4583,7 +4588,7 @@ flush_cap_releases: __ceph_queue_cap_release(session, cap); spin_unlock(&session->s_cap_lock); } - ceph_flush_cap_releases(mdsc, session); + ceph_flush_session_cap_releases(mdsc, session); goto done; bad: @@ -4640,7 +4645,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) * slowness doesn't block mdsc delayed work, * preventing send_renew_caps() from running. */ - if (jiffies - loop_start >= 5 * HZ) + if (time_after_eq(jiffies, loop_start + 5 * HZ)) break; } spin_unlock(&mdsc->cap_delay_lock); @@ -4682,6 +4687,28 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true); } +/* + * Flush all cap releases to the mds + */ +static void flush_cap_releases(struct ceph_mds_session *s) +{ + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; + + doutc(cl, "begin\n"); + spin_lock(&s->s_cap_lock); + if (s->s_num_cap_releases) + ceph_flush_session_cap_releases(mdsc, s); + spin_unlock(&s->s_cap_lock); + doutc(cl, "done\n"); + +} + +void ceph_flush_cap_releases(struct ceph_mds_client *mdsc) +{ + ceph_mdsc_iterate_sessions(mdsc, flush_cap_releases, true); +} + void __ceph_touch_fmode(struct ceph_inode_info *ci, struct ceph_mds_client *mdsc, int fmode) { diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h index 47e0c319fc68..d0768239a1c9 100644 --- a/fs/ceph/crypto.h +++ b/fs/ceph/crypto.h @@ -27,7 +27,7 @@ struct ceph_fname { }; /* - * Header for the crypted file when truncating the size, this + * Header for the encrypted file when truncating the size, this * will be sent to MDS, and the MDS will update the encrypted * last block and then truncate the size. */ diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 24c08078f5aa..fdd404fc8112 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -357,7 +357,7 @@ static int status_show(struct seq_file *s, void *p) seq_printf(s, "instance: %s.%lld %s/%u\n", ENTITY_NAME(inst->name), ceph_pr_addr(client_addr), le32_to_cpu(client_addr->nonce)); - seq_printf(s, "blocklisted: %s\n", fsc->blocklisted ? 
"true" : "false"); + seq_printf(s, "blocklisted: %s\n", str_true_false(fsc->blocklisted)); return 0; } @@ -412,7 +412,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - char name[100]; + char name[NAME_MAX]; doutc(fsc->client, "begin\n"); fsc->debugfs_congestion_kb = diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0e9f56eaba1e..a321aa6d0ed2 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -141,17 +141,18 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, if (ptr_pos >= i_size_read(dir)) return NULL; - if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) { + if (!cache_ctl->folio || ptr_pgoff != cache_ctl->folio->index) { ceph_readdir_cache_release(cache_ctl); - cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); - if (!cache_ctl->page) { - doutc(cl, " page %lu not found\n", ptr_pgoff); + cache_ctl->folio = filemap_lock_folio(&dir->i_data, ptr_pgoff); + if (IS_ERR(cache_ctl->folio)) { + cache_ctl->folio = NULL; + doutc(cl, " folio %lu not found\n", ptr_pgoff); return ERR_PTR(-EAGAIN); } /* reading/filling the cache are serialized by - i_rwsem, no need to use page lock */ - unlock_page(cache_ctl->page); - cache_ctl->dentries = kmap(cache_ctl->page); + i_rwsem, no need to use folio lock */ + folio_unlock(cache_ctl->folio); + cache_ctl->dentries = kmap_local_folio(cache_ctl->folio, 0); } cache_ctl->index = idx & idx_mask; @@ -207,7 +208,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, dentry = __dcache_find_get_entry(parent, idx + step, &cache_ctl); if (!dentry) { - /* use linar search */ + /* use linear search */ idx = 0; break; } @@ -659,7 +660,7 @@ static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos) return true; if (is_hash_order(new_pos)) { /* no need to reset last_name for a forward seek when - * dentries are sotred in hash order */ + * dentries are sorted in hash order */ } else if (dfi->frag != fpos_frag(new_pos)) { return true; } @@ -707,7 +708,6 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) if (offset != file->f_pos) { file->f_pos = offset; - file->f_version = 0; dfi->file_info.flags &= ~CEPH_F_ATEND; } retval = offset; @@ -1093,19 +1093,20 @@ out: return err; } -static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; + struct dentry *ret; int err; int op; err = ceph_wait_on_conflict_unlink(dentry); if (err) - return err; + return ERR_PTR(err); if (ceph_snap(dir) == CEPH_SNAPDIR) { /* mkdir .snap/foo is a MKSNAP */ @@ -1117,32 +1118,32 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, ceph_vinop(dir), dentry, dentry, mode); op = CEPH_MDS_OP_MKDIR; } else { - err = -EROFS; + ret = ERR_PTR(-EROFS); goto out; } if (op == CEPH_MDS_OP_MKDIR && ceph_quota_is_max_files_exceeded(dir)) { - err = -EDQUOT; + ret = ERR_PTR(-EDQUOT); goto out; } if ((op == CEPH_MDS_OP_MKSNAP) && IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir)) { - err = -ENOKEY; + ret = ERR_PTR(-ENOKEY); goto out; } req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { - err = PTR_ERR(req); + ret = ERR_CAST(req); goto out; } mode |= S_IFDIR; req->r_new_inode = 
ceph_new_inode(dir, dentry, &mode, &as_ctx); if (IS_ERR(req->r_new_inode)) { - err = PTR_ERR(req->r_new_inode); + ret = ERR_CAST(req->r_new_inode); req->r_new_inode = NULL; goto out_req; } @@ -1166,15 +1167,22 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, !req->r_reply_info.head->is_target && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); + ret = ERR_PTR(err); out_req: + if (!IS_ERR(ret) && req->r_dentry != dentry) + /* Some other dentry was spliced in */ + ret = dget(req->r_dentry); ceph_mdsc_put_request(req); out: - if (!err) + if (!IS_ERR(ret)) { + if (ret) + dentry = ret; ceph_init_inode_acls(d_inode(dentry), &as_ctx); - else + } else { d_drop(dentry); + } ceph_release_acl_sec_ctx(&as_ctx); - return err; + return ret; } static int ceph_link(struct dentry *old_dentry, struct inode *dir, @@ -1336,8 +1344,12 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = d_inode(dentry); struct ceph_mds_request *req; bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); + struct dentry *dn; int err = -EROFS; int op; + char *path; + int pathlen; + u64 pathbase; if (ceph_snap(dir) == CEPH_SNAPDIR) { /* rmdir .snap/foo is RMSNAP */ @@ -1351,6 +1363,30 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; } else goto out; + + dn = d_find_alias(dir); + if (!dn) { + try_async = false; + } else { + path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0); + if (IS_ERR(path)) { + try_async = false; + err = 0; + } else { + err = ceph_mds_check_access(mdsc, path, MAY_WRITE); + } + ceph_mdsc_free_path(path, pathlen); + dput(dn); + + /* For none EACCES cases will let the MDS do the mds auth check */ + if (err == -EACCES) { + return err; + } else if (err < 0) { + try_async = false; + err = 0; + } + } + retry: req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { @@ -1561,7 +1597,7 @@ void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di) } spin_lock(&mdsc->dentry_list_lock); - __dentry_dir_lease_touch(mdsc, di), + __dentry_dir_lease_touch(mdsc, di); spin_unlock(&mdsc->dentry_list_lock); } @@ -1913,29 +1949,19 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry, /* * Check if cached dentry can be trusted. 
*/ -static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) +static int ceph_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; int valid = 0; - struct dentry *parent; - struct inode *dir, *inode; + struct inode *inode; - valid = fscrypt_d_revalidate(dentry, flags); + valid = fscrypt_d_revalidate(dir, name, dentry, flags); if (valid <= 0) return valid; - if (flags & LOOKUP_RCU) { - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - inode = d_inode_rcu(dentry); - } else { - parent = dget_parent(dentry); - dir = d_inode(parent); - inode = d_inode(dentry); - } + inode = d_inode_rcu(dentry); doutc(cl, "%p '%pd' inode %p offset 0x%llx nokey %d\n", dentry, dentry, inode, ceph_dentry(dentry)->offset, @@ -1981,6 +2007,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) req->r_parent = dir; ihold(dir); + req->r_dname = name; + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; if (ceph_security_xattr_wanted(dir)) mask |= CEPH_CAP_XATTR_SHARED; @@ -2011,9 +2039,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) doutc(cl, "%p '%pd' %s\n", dentry, dentry, valid ? "valid" : "invalid"); if (!valid) ceph_dir_clear_complete(dir); - - if (!(flags & LOOKUP_RCU)) - dput(parent); return valid; } @@ -2031,7 +2056,7 @@ static int ceph_d_delete(const struct dentry *dentry) return 0; if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) return 0; - /* vaild lease? */ + /* valid lease? */ di = ceph_dentry(dentry); if (di) { if (__dentry_lease_is_valid(di)) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index a79f163ae4ed..150076ced937 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -3,7 +3,7 @@ #include <linux/exportfs.h> #include <linux/slab.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include "super.h" #include "mds_client.h" @@ -393,9 +393,9 @@ static struct dentry *ceph_get_parent(struct dentry *child) } dir = snapdir; } - /* If directory has already been deleted, futher get_parent + /* If directory has already been deleted, further get_parent * will fail. Do not mark snapdir dentry as disconnected, - * this prevent exportfs from doing futher get_parent. */ + * this prevents exportfs from doing further get_parent. 
*/ if (unlinked) dn = d_obtain_root(dir); else @@ -452,7 +452,13 @@ static int __get_snap_name(struct dentry *parent, char *name, goto out; if (ceph_snap(inode) == CEPH_SNAPDIR) { if (ceph_snap(dir) == CEPH_NOSNAP) { - strcpy(name, fsc->mount_options->snapdir_name); + /* + * .get_name() from struct export_operations + * assumes that its 'name' parameter is pointing + * to a NAME_MAX+1 sized buffer + */ + strscpy(name, fsc->mount_options->snapdir_name, + NAME_MAX + 1); err = 0; } goto out; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 16873d07692f..851d70200c6b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -366,6 +366,12 @@ int ceph_open(struct inode *inode, struct file *file) struct ceph_file_info *fi = file->private_data; int err; int flags, fmode, wanted; + struct dentry *dentry; + char *path; + int pathlen; + u64 pathbase; + bool do_sync = false; + int mask = MAY_READ; if (fi) { doutc(cl, "file %p is already opened\n", file); @@ -387,6 +393,31 @@ int ceph_open(struct inode *inode, struct file *file) fmode = ceph_flags_to_mode(flags); wanted = ceph_caps_for_mode(fmode); + if (fmode & CEPH_FILE_MODE_WR) + mask |= MAY_WRITE; + dentry = d_find_alias(inode); + if (!dentry) { + do_sync = true; + } else { + path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) { + do_sync = true; + err = 0; + } else { + err = ceph_mds_check_access(mdsc, path, mask); + } + ceph_mdsc_free_path(path, pathlen); + dput(dentry); + + /* For none EACCES cases will let the MDS do the mds auth check */ + if (err == -EACCES) { + return err; + } else if (err < 0) { + do_sync = true; + err = 0; + } + } + /* snapped files are read-only */ if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) return -EROFS; @@ -402,7 +433,7 @@ int ceph_open(struct inode *inode, struct file *file) * asynchronously. 
*/ spin_lock(&ci->i_ceph_lock); - if (__ceph_is_any_real_caps(ci) && + if (!do_sync && __ceph_is_any_real_caps(ci) && (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { int mds_wanted = __ceph_caps_mds_wanted(ci, true); int issued = __ceph_caps_issued(ci, NULL); @@ -420,7 +451,7 @@ int ceph_open(struct inode *inode, struct file *file) ceph_check_caps(ci, 0); return ceph_init_file(inode, file, fmode); - } else if (ceph_snap(inode) != CEPH_NOSNAP && + } else if (!do_sync && ceph_snap(inode) != CEPH_NOSNAP && (ci->i_snap_caps & wanted) == wanted) { __ceph_touch_fmode(ci, mdsc, fmode); spin_unlock(&ci->i_ceph_lock); @@ -759,6 +790,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); int mask; int err; + char *path; + int pathlen; + u64 pathbase; doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n", dir, ceph_vinop(dir), dentry, dentry, @@ -776,6 +810,34 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, */ flags &= ~O_TRUNC; + dn = d_find_alias(dir); + if (!dn) { + try_async = false; + } else { + path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0); + if (IS_ERR(path)) { + try_async = false; + err = 0; + } else { + int fmode = ceph_flags_to_mode(flags); + + mask = MAY_READ; + if (fmode & CEPH_FILE_MODE_WR) + mask |= MAY_WRITE; + err = ceph_mds_check_access(mdsc, path, mask); + } + ceph_mdsc_free_path(path, pathlen); + dput(dn); + + /* For none EACCES cases will let the MDS do the mds auth check */ + if (err == -EACCES) { + return err; + } else if (err < 0) { + try_async = false; + err = 0; + } + } + retry: if (flags & O_CREAT) { if (ceph_quota_is_max_files_exceeded(dir)) @@ -1004,7 +1066,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, if (ceph_inode_is_shutdown(inode)) return -EIO; - if (!len) + if (!len || !i_size) return 0; /* * flush any page cache pages in this range. 
this @@ -1024,7 +1086,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, int num_pages; size_t page_off; bool more; - int idx; + int idx = 0; size_t left; struct ceph_osd_req_op *op; u64 read_off = off; @@ -1054,6 +1116,16 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, len = read_off + read_len - off; more = len < iov_iter_count(to); + op = &req->r_ops[0]; + if (sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + num_pages = calc_pages_for(read_off, read_len); page_off = offset_in_page(off); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); @@ -1065,17 +1137,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, osd_req_op_extent_osd_data_pages(req, 0, pages, read_len, offset_in_page(read_off), - false, false); - - op = &req->r_ops[0]; - if (sparse) { - extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); - ret = ceph_alloc_sparse_ext_map(op, extent_cnt); - if (ret) { - ceph_osdc_put_request(req); - break; - } - } + false, true); ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); @@ -1098,7 +1160,14 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, else if (ret == -ENOENT) ret = 0; - if (ret > 0 && IS_ENCRYPTED(inode)) { + if (ret < 0) { + ceph_osdc_put_request(req); + if (ret == -EBLOCKLISTED) + fsc->blocklisted = true; + break; + } + + if (IS_ENCRYPTED(inode)) { int fret; fret = ceph_fscrypt_decrypt_extents(inode, pages, @@ -1124,10 +1193,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ret = min_t(ssize_t, fret, len); } - ceph_osdc_put_request(req); - /* Short read but not EOF? Zero out the remainder. */ - if (ret >= 0 && ret < len && (off + ret < i_size)) { + if (ret < len && (off + ret < i_size)) { int zlen = min(len - ret, i_size - off - ret); int zoff = page_off + ret; @@ -1137,13 +1204,11 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ret += zlen; } - idx = 0; - if (ret <= 0) - left = 0; - else if (off + ret > i_size) - left = i_size - off; + if (off + ret > i_size) + left = (i_size > off) ? i_size - off : 0; else left = ret; + while (left > 0) { size_t plen, copied; @@ -1159,13 +1224,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, break; } } - ceph_release_page_vector(pages, num_pages); - if (ret < 0) { - if (ret == -EBLOCKLISTED) - fsc->blocklisted = true; - break; - } + ceph_osdc_put_request(req); if (off >= i_size || !more) break; @@ -1491,6 +1551,16 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, break; } + op = &req->r_ops[0]; + if (!write && sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, size); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); if (len < 0) { ceph_osdc_put_request(req); @@ -1500,6 +1570,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (len != size) osd_req_op_extent_update(req, 0, len); + osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); + /* * To simplify error handling, allow AIO when IO within i_size * or IO can be satisfied by single OSD request. 
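
The reworked __ceph_sync_read() above zero-fills a short read that is not at EOF and then clamps how much is handed to the caller so it never runs past i_size (the new "(i_size > off) ? i_size - off : 0" form also avoids an underflow when the inode shrank under us). A self-contained sketch of just that arithmetic; names are illustrative, not the kernel's:

#include <stddef.h>
#include <stdint.h>

/*
 * The OSD returned 'ret' bytes for a 'len'-byte read at offset 'off' of a
 * file that is 'i_size' bytes long.  Returns how many bytes to give the
 * caller; *zero_len is how much of that must be zeroes (a short read
 * inside the file, e.g. a hole or a truncated object).
 */
static size_t sync_read_copy_len(size_t ret, size_t len, uint64_t off,
				 uint64_t i_size, size_t *zero_len)
{
	*zero_len = 0;

	/* Short read but not EOF?  Pad with zeroes up to len or EOF. */
	if (ret < len && off + ret < i_size) {
		size_t want = len - ret;
		size_t to_eof = i_size - off - ret;

		*zero_len = want < to_eof ? want : to_eof;
		ret += *zero_len;
	}

	/* Never copy past EOF. */
	if (off + ret > i_size)
		return i_size > off ? i_size - off : 0;
	return ret;
}
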
@@ -1531,17 +1603,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req->r_mtime = mtime; } - osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); - op = &req->r_ops[0]; - if (sparse) { - extent_cnt = __ceph_sparse_read_ext_count(inode, size); - ret = ceph_alloc_sparse_ext_map(op, extent_cnt); - if (ret) { - ceph_osdc_put_request(req); - break; - } - } - if (aio_req) { aio_req->total_len += len; aio_req->num_reqs++; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7b2e77517f23..06cd2963e41e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -160,7 +160,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino, } /* - * get/constuct snapdir inode for a given directory + * get/construct snapdir inode for a given directory */ struct inode *ceph_get_snapdir(struct inode *parent) { @@ -695,6 +695,7 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_state & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); @@ -1778,7 +1779,7 @@ retry_lookup: if (err < 0) goto done; } else if (rinfo->head->is_dentry && req->r_dentry) { - /* parent inode is not locked, be carefull */ + /* parent inode is not locked, be careful */ struct ceph_vino *ptvino = NULL; dvino.ino = le64_to_cpu(rinfo->diri.in->ino); dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); @@ -1844,10 +1845,9 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) { - if (ctl->page) { - kunmap(ctl->page); - put_page(ctl->page); - ctl->page = NULL; + if (ctl->folio) { + folio_release_kmap(ctl->folio, ctl->dentries); + ctl->folio = NULL; } } @@ -1861,20 +1861,26 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, unsigned idx = ctl->index % nsize; pgoff_t pgoff = ctl->index / nsize; - if (!ctl->page || pgoff != page_index(ctl->page)) { + if (!ctl->folio || pgoff != ctl->folio->index) { ceph_readdir_cache_release(ctl); + fgf_t fgf = FGP_LOCK; + if (idx == 0) - ctl->page = grab_cache_page(&dir->i_data, pgoff); - else - ctl->page = find_lock_page(&dir->i_data, pgoff); - if (!ctl->page) { + fgf |= FGP_ACCESSED | FGP_CREAT; + + ctl->folio = __filemap_get_folio(&dir->i_data, pgoff, + fgf, mapping_gfp_mask(&dir->i_data)); + if (IS_ERR(ctl->folio)) { + int err = PTR_ERR(ctl->folio); + + ctl->folio = NULL; ctl->index = -1; - return idx == 0 ? -ENOMEM : 0; + return idx == 0 ? 
err : 0; } /* reading/filling the cache are serialized by - * i_rwsem, no need to use page lock */ - unlock_page(ctl->page); - ctl->dentries = kmap(ctl->page); + * i_rwsem, no need to use folio lock */ + folio_unlock(ctl->folio); + ctl->dentries = kmap_local_folio(ctl->folio, 0); if (idx == 0) memset(ctl->dentries, 0, PAGE_SIZE); } @@ -2361,7 +2367,7 @@ static int fill_fscrypt_truncate(struct inode *inode, /* Try to writeback the dirty pagecaches */ if (issued & (CEPH_CAP_FILE_BUFFER)) { - loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1; + loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1; ret = filemap_write_and_wait_range(inode->i_mapping, orig_pos, lend); @@ -2480,6 +2486,34 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, bool lock_snap_rwsem = false; bool fill_fscrypt; int truncate_retry = 20; /* The RMW will take around 50ms */ + struct dentry *dentry; + char *path; + int pathlen; + u64 pathbase; + bool do_sync = false; + + dentry = d_find_alias(inode); + if (!dentry) { + do_sync = true; + } else { + path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) { + do_sync = true; + err = 0; + } else { + err = ceph_mds_check_access(mdsc, path, MAY_WRITE); + } + ceph_mdsc_free_path(path, pathlen); + dput(dentry); + + /* For none EACCES cases will let the MDS do the mds auth check */ + if (err == -EACCES) { + return err; + } else if (err < 0) { + do_sync = true; + err = 0; + } + } retry: prealloc_cf = ceph_alloc_cap_flush(); @@ -2526,7 +2560,7 @@ retry: /* It should never be re-set once set */ WARN_ON_ONCE(ci->fscrypt_auth); - if (issued & CEPH_CAP_AUTH_EXCL) { + if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) { dirtied |= CEPH_CAP_AUTH_EXCL; kfree(ci->fscrypt_auth); ci->fscrypt_auth = (u8 *)cia->fscrypt_auth; @@ -2555,7 +2589,7 @@ retry: ceph_vinop(inode), from_kuid(&init_user_ns, inode->i_uid), from_kuid(&init_user_ns, attr->ia_uid)); - if (issued & CEPH_CAP_AUTH_EXCL) { + if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) { inode->i_uid = fsuid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || @@ -2573,7 +2607,7 @@ retry: ceph_vinop(inode), from_kgid(&init_user_ns, inode->i_gid), from_kgid(&init_user_ns, attr->ia_gid)); - if (issued & CEPH_CAP_AUTH_EXCL) { + if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) { inode->i_gid = fsgid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || @@ -2587,7 +2621,7 @@ retry: if (ia_valid & ATTR_MODE) { doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode, ceph_vinop(inode), inode->i_mode, attr->ia_mode); - if (issued & CEPH_CAP_AUTH_EXCL) { + if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) { inode->i_mode = attr->ia_mode; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || @@ -2606,11 +2640,11 @@ retry: inode, ceph_vinop(inode), atime.tv_sec, atime.tv_nsec, attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); - if (issued & CEPH_CAP_FILE_EXCL) { + if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) { ci->i_time_warp_seq++; inode_set_atime_to_ts(inode, attr->ia_atime); dirtied |= CEPH_CAP_FILE_EXCL; - } else if ((issued & CEPH_CAP_FILE_WR) && + } else if (!do_sync && (issued & CEPH_CAP_FILE_WR) && timespec64_compare(&atime, &attr->ia_atime) < 0) { inode_set_atime_to_ts(inode, attr->ia_atime); @@ -2646,7 +2680,7 @@ retry: CEPH_FSCRYPT_BLOCK_SIZE)); req->r_fscrypt_file = attr->ia_size; fill_fscrypt = true; - } else if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { + } else if (!do_sync && (issued & 
CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { if (attr->ia_size > isize) { i_size_write(inode, attr->ia_size); inode->i_blocks = calc_inode_blocks(attr->ia_size); @@ -2683,11 +2717,11 @@ retry: inode, ceph_vinop(inode), mtime.tv_sec, mtime.tv_nsec, attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); - if (issued & CEPH_CAP_FILE_EXCL) { + if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) { ci->i_time_warp_seq++; inode_set_mtime_to_ts(inode, attr->ia_mtime); dirtied |= CEPH_CAP_FILE_EXCL; - } else if ((issued & CEPH_CAP_FILE_WR) && + } else if (!do_sync && (issued & CEPH_CAP_FILE_WR) && timespec64_compare(&mtime, &attr->ia_mtime) < 0) { inode_set_mtime_to_ts(inode, attr->ia_mtime); dirtied |= CEPH_CAP_FILE_WR; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 360b686c3c67..230e0c3f341f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -827,7 +827,7 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) * And the worst case is that for the none async openc request it will * successfully open the file if the CDentry hasn't been unlinked yet, * but later the previous delayed async unlink request will remove the - * CDenty. That means the just created file is possiblly deleted later + * CDentry. That means the just created file is possibly deleted later * by accident. * * We need to wait for the inflight async unlink requests to finish @@ -1747,14 +1747,6 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, } } -void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) -{ - mutex_lock(&mdsc->mutex); - __open_export_target_sessions(mdsc, session); - mutex_unlock(&mdsc->mutex); -} - /* * session caps */ @@ -2266,7 +2258,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, trim_caps - remaining); } - ceph_flush_cap_releases(mdsc, session); + ceph_flush_session_cap_releases(mdsc, session); return 0; } @@ -2362,7 +2354,7 @@ again: item->ino = cpu_to_le64(cap->cap_ino); item->cap_id = cpu_to_le64(cap->cap_id); item->migrate_seq = cpu_to_le32(cap->mseq); - item->seq = cpu_to_le32(cap->issue_seq); + item->issue_seq = cpu_to_le32(cap->issue_seq); msg->front.iov_len += sizeof(*item); ceph_put_cap(mdsc, cap); @@ -2420,7 +2412,7 @@ static void ceph_cap_release_work(struct work_struct *work) ceph_put_mds_session(session); } -void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, +void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; @@ -2447,7 +2439,7 @@ void __ceph_queue_cap_release(struct ceph_mds_session *session, session->s_num_cap_releases++; if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE)) - ceph_flush_cap_releases(session->s_mdsc, session); + ceph_flush_session_cap_releases(session->s_mdsc, session); } static void ceph_cap_reclaim_work(struct work_struct *work) @@ -2629,6 +2621,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) { struct inode *dir = req->r_parent; struct dentry *dentry = req->r_dentry; + const struct qstr *name = req->r_dname; u8 *cryptbuf = NULL; u32 len = 0; int ret = 0; @@ -2649,8 +2642,10 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) if (!fscrypt_has_encryption_key(dir)) goto success; - if (!fscrypt_fname_encrypted_size(dir, dentry->d_name.len, NAME_MAX, - &len)) { + if (!name) + name = &dentry->d_name; + + if (!fscrypt_fname_encrypted_size(dir, name->len, NAME_MAX, &len)) { WARN_ON_ONCE(1); 
return ERR_PTR(-ENAMETOOLONG); } @@ -2665,7 +2660,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) if (!cryptbuf) return ERR_PTR(-ENOMEM); - ret = fscrypt_fname_encrypt(dir, &dentry->d_name, cryptbuf, len); + ret = fscrypt_fname_encrypt(dir, name, cryptbuf, len); if (ret) { kfree(cryptbuf); return ERR_PTR(ret); @@ -2808,12 +2803,11 @@ retry: if (pos < 0) { /* - * A rename didn't occur, but somehow we didn't end up where - * we thought we would. Throw a warning and try again. + * The path is longer than PATH_MAX and this function + * cannot ever succeed. Creating paths that long is + * possible with Ceph, but Linux cannot use them. */ - pr_warn_client(cl, "did not end path lookup where expected (pos = %d)\n", - pos); - goto retry; + return ERR_PTR(-ENAMETOOLONG); } *pbase = base; @@ -2954,12 +2948,12 @@ static struct ceph_mds_request_head_legacy * find_legacy_request_head(void *p, u64 features) { bool legacy = !(features & CEPH_FEATURE_FS_BTIME); - struct ceph_mds_request_head_old *ohead; + struct ceph_mds_request_head *head; if (legacy) return (struct ceph_mds_request_head_legacy *)p; - ohead = (struct ceph_mds_request_head_old *)p; - return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid; + head = (struct ceph_mds_request_head *)p; + return (struct ceph_mds_request_head_legacy *)&head->oldest_client_tid; } /* @@ -3029,7 +3023,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, if (legacy) len = sizeof(struct ceph_mds_request_head_legacy); else if (request_head_version == 1) - len = sizeof(struct ceph_mds_request_head_old); + len = offsetofend(struct ceph_mds_request_head, args); else if (request_head_version == 2) len = offsetofend(struct ceph_mds_request_head, ext_num_fwd); else @@ -3113,11 +3107,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, msg->hdr.version = cpu_to_le16(3); p = msg->front.iov_base + sizeof(*lhead); } else if (request_head_version == 1) { - struct ceph_mds_request_head_old *ohead = msg->front.iov_base; + struct ceph_mds_request_head *nhead = msg->front.iov_base; msg->hdr.version = cpu_to_le16(4); - ohead->version = cpu_to_le16(1); - p = msg->front.iov_base + sizeof(*ohead); + nhead->version = cpu_to_le16(1); + p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, args); } else if (request_head_version == 2) { struct ceph_mds_request_head *nhead = msg->front.iov_base; @@ -3269,12 +3263,12 @@ static int __prepare_send_request(struct ceph_mds_session *session, &session->s_features); /* - * Avoid inifinite retrying after overflow. The client will + * Avoid infinite retrying after overflow. The client will * increase the retry count and if the MDS is old version, * so we limit to retry at most 256 times. */ if (req->r_attempts) { - old_max_retry = sizeof_field(struct ceph_mds_request_head_old, + old_max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); if ((old_version && req->r_attempts >= old_max_retry) || @@ -3522,7 +3516,7 @@ static void __do_request(struct ceph_mds_client *mdsc, /* * For async create we will choose the auth MDS of frag in parent - * directory to send the request and ususally this works fine, but + * directory to send the request and usually this works fine, but * if the migrated the dirtory to another MDS before it could handle * it the request will be forwarded. 
* @@ -4033,7 +4027,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, __unregister_request(mdsc, req); } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) { /* - * Avoid inifinite retrying after overflow. + * Avoid infinite retrying after overflow. * * The MDS will increase the fwd count and in client side * if the num_fwd is less than the one saved in request @@ -4112,10 +4106,13 @@ static void handle_session(struct ceph_mds_session *session, void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; struct ceph_mds_session_head *h; - u32 op; + struct ceph_mds_cap_auth *cap_auths = NULL; + u32 op, cap_auths_num = 0; u64 seq, features = 0; int wake = 0; bool blocklisted = false; + u32 i; + /* decode */ ceph_decode_need(&p, end, sizeof(*h), bad); @@ -4160,7 +4157,101 @@ static void handle_session(struct ceph_mds_session *session, } } + if (msg_version >= 6) { + ceph_decode_32_safe(&p, end, cap_auths_num, bad); + doutc(cl, "cap_auths_num %d\n", cap_auths_num); + + if (cap_auths_num && op != CEPH_SESSION_OPEN) { + WARN_ON_ONCE(op != CEPH_SESSION_OPEN); + goto skip_cap_auths; + } + + cap_auths = kcalloc(cap_auths_num, + sizeof(struct ceph_mds_cap_auth), + GFP_KERNEL); + if (!cap_auths) { + pr_err_client(cl, "No memory for cap_auths\n"); + return; + } + + for (i = 0; i < cap_auths_num; i++) { + u32 _len, j; + + /* struct_v, struct_compat, and struct_len in MDSCapAuth */ + ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad); + + /* struct_v, struct_compat, and struct_len in MDSCapMatch */ + ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad); + ceph_decode_64_safe(&p, end, cap_auths[i].match.uid, bad); + ceph_decode_32_safe(&p, end, _len, bad); + if (_len) { + cap_auths[i].match.gids = kcalloc(_len, sizeof(u32), + GFP_KERNEL); + if (!cap_auths[i].match.gids) { + pr_err_client(cl, "No memory for gids\n"); + goto fail; + } + + cap_auths[i].match.num_gids = _len; + for (j = 0; j < _len; j++) + ceph_decode_32_safe(&p, end, + cap_auths[i].match.gids[j], + bad); + } + + ceph_decode_32_safe(&p, end, _len, bad); + if (_len) { + cap_auths[i].match.path = kcalloc(_len + 1, sizeof(char), + GFP_KERNEL); + if (!cap_auths[i].match.path) { + pr_err_client(cl, "No memory for path\n"); + goto fail; + } + ceph_decode_copy(&p, cap_auths[i].match.path, _len); + + /* Remove the tailing '/' */ + while (_len && cap_auths[i].match.path[_len - 1] == '/') { + cap_auths[i].match.path[_len - 1] = '\0'; + _len -= 1; + } + } + + ceph_decode_32_safe(&p, end, _len, bad); + if (_len) { + cap_auths[i].match.fs_name = kcalloc(_len + 1, sizeof(char), + GFP_KERNEL); + if (!cap_auths[i].match.fs_name) { + pr_err_client(cl, "No memory for fs_name\n"); + goto fail; + } + ceph_decode_copy(&p, cap_auths[i].match.fs_name, _len); + } + + ceph_decode_8_safe(&p, end, cap_auths[i].match.root_squash, bad); + ceph_decode_8_safe(&p, end, cap_auths[i].readable, bad); + ceph_decode_8_safe(&p, end, cap_auths[i].writeable, bad); + doutc(cl, "uid %lld, num_gids %u, path %s, fs_name %s, root_squash %d, readable %d, writeable %d\n", + cap_auths[i].match.uid, cap_auths[i].match.num_gids, + cap_auths[i].match.path, cap_auths[i].match.fs_name, + cap_auths[i].match.root_squash, + cap_auths[i].readable, cap_auths[i].writeable); + } + } + +skip_cap_auths: mutex_lock(&mdsc->mutex); + if (op == CEPH_SESSION_OPEN) { + if (mdsc->s_cap_auths) { + for (i = 0; i < mdsc->s_cap_auths_num; i++) { + kfree(mdsc->s_cap_auths[i].match.gids); + kfree(mdsc->s_cap_auths[i].match.path); + kfree(mdsc->s_cap_auths[i].match.fs_name); + } + 
kfree(mdsc->s_cap_auths); + } + mdsc->s_cap_auths_num = cap_auths_num; + mdsc->s_cap_auths = cap_auths; + } if (op == CEPH_SESSION_CLOSE) { ceph_get_mds_session(session); __unregister_session(mdsc, session); @@ -4243,7 +4334,7 @@ static void handle_session(struct ceph_mds_session *session, /* flush cap releases */ spin_lock(&session->s_cap_lock); if (session->s_num_cap_releases) - ceph_flush_cap_releases(mdsc, session); + ceph_flush_session_cap_releases(mdsc, session); spin_unlock(&session->s_cap_lock); send_flushmsg_ack(mdsc, session, seq); @@ -4290,6 +4381,13 @@ bad: pr_err_client(cl, "corrupt message mds%d len %d\n", mds, (int)msg->front.iov_len); ceph_msg_dump(msg); +fail: + for (i = 0; i < cap_auths_num; i++) { + kfree(cap_auths[i].match.gids); + kfree(cap_auths[i].match.path); + kfree(cap_auths[i].match.fs_name); + } + kfree(cap_auths); return; } @@ -4806,7 +4904,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, } else { recon_state.msg_version = 2; } - /* trsaverse this session's caps */ + /* traverse this session's caps */ err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state); spin_lock(&session->s_cap_lock); @@ -5342,6 +5440,8 @@ static void delayed_work(struct work_struct *work) } mutex_unlock(&mdsc->mutex); + ceph_flush_session_cap_releases(mdsc, s); + mutex_lock(&s->s_mutex); if (renew_caps) send_renew_caps(mdsc, s); @@ -5389,6 +5489,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->stopping_lock); atomic_set(&mdsc->stopping_blockers, 0); init_completion(&mdsc->stopping_waiter); + atomic64_set(&mdsc->dirty_folios, 0); + init_waitqueue_head(&mdsc->flush_end_wq); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->quotarealms_inodes = RB_ROOT; @@ -5401,7 +5503,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; INIT_LIST_HEAD(&mdsc->cap_delay_list); +#ifdef CONFIG_DEBUG_FS INIT_LIST_HEAD(&mdsc->cap_wait_list); +#endif spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); INIT_LIST_HEAD(&mdsc->snap_flush_list); @@ -5499,6 +5603,173 @@ void send_flush_mdlog(struct ceph_mds_session *s) mutex_unlock(&s->s_mutex); } +static int ceph_mds_auth_match(struct ceph_mds_client *mdsc, + struct ceph_mds_cap_auth *auth, + const struct cred *cred, + char *tpath) +{ + u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid); + u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid); + struct ceph_client *cl = mdsc->fsc->client; + const char *spath = mdsc->fsc->mount_options->server_path; + bool gid_matched = false; + u32 gid, tlen, len; + int i, j; + + doutc(cl, "match.uid %lld\n", auth->match.uid); + if (auth->match.uid != MDS_AUTH_UID_ANY) { + if (auth->match.uid != caller_uid) + return 0; + if (auth->match.num_gids) { + for (i = 0; i < auth->match.num_gids; i++) { + if (caller_gid == auth->match.gids[i]) + gid_matched = true; + } + if (!gid_matched && cred->group_info->ngroups) { + for (i = 0; i < cred->group_info->ngroups; i++) { + gid = from_kgid(&init_user_ns, + cred->group_info->gid[i]); + for (j = 0; j < auth->match.num_gids; j++) { + if (gid == auth->match.gids[j]) { + gid_matched = true; + break; + } + } + if (gid_matched) + break; + } + } + if (!gid_matched) + return 0; + } + } + + /* path match */ + if (auth->match.path) { + if (!tpath) + return 0; + + tlen = strlen(tpath); + len = strlen(auth->match.path); + if (len) { + char *_tpath = tpath; + bool free_tpath = false; + 
int m, n; + + doutc(cl, "server path %s, tpath %s, match.path %s\n", + spath, tpath, auth->match.path); + if (spath && (m = strlen(spath)) != 1) { + /* mount path + '/' + tpath + an extra space */ + n = m + 1 + tlen + 1; + _tpath = kmalloc(n, GFP_NOFS); + if (!_tpath) + return -ENOMEM; + /* remove the leading '/' */ + snprintf(_tpath, n, "%s/%s", spath + 1, tpath); + free_tpath = true; + tlen = strlen(_tpath); + } + + /* + * Please note the tailing '/' for match.path has already + * been removed when parsing. + * + * Remove the tailing '/' for the target path. + */ + while (tlen && _tpath[tlen - 1] == '/') { + _tpath[tlen - 1] = '\0'; + tlen -= 1; + } + doutc(cl, "_tpath %s\n", _tpath); + + /* + * In case first == _tpath && tlen == len: + * match.path=/foo --> /foo _path=/foo --> match + * match.path=/foo/ --> /foo _path=/foo --> match + * + * In case first == _tmatch.path && tlen > len: + * match.path=/foo/ --> /foo _path=/foo/ --> match + * match.path=/foo --> /foo _path=/foo/ --> match + * match.path=/foo/ --> /foo _path=/foo/d --> match + * match.path=/foo --> /foo _path=/food --> mismatch + * + * All the other cases --> mismatch + */ + bool path_matched = true; + char *first = strstr(_tpath, auth->match.path); + if (first != _tpath || + (tlen > len && _tpath[len] != '/')) { + path_matched = false; + } + + if (free_tpath) + kfree(_tpath); + + if (!path_matched) + return 0; + } + } + + doutc(cl, "matched\n"); + return 1; +} + +int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, int mask) +{ + const struct cred *cred = get_current_cred(); + u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid); + u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid); + struct ceph_mds_cap_auth *rw_perms_s = NULL; + struct ceph_client *cl = mdsc->fsc->client; + bool root_squash_perms = true; + int i, err; + + doutc(cl, "tpath '%s', mask %d, caller_uid %d, caller_gid %d\n", + tpath, mask, caller_uid, caller_gid); + + for (i = 0; i < mdsc->s_cap_auths_num; i++) { + struct ceph_mds_cap_auth *s = &mdsc->s_cap_auths[i]; + + err = ceph_mds_auth_match(mdsc, s, cred, tpath); + if (err < 0) { + put_cred(cred); + return err; + } else if (err > 0) { + /* always follow the last auth caps' permission */ + root_squash_perms = true; + rw_perms_s = NULL; + if ((mask & MAY_WRITE) && s->writeable && + s->match.root_squash && (!caller_uid || !caller_gid)) + root_squash_perms = false; + + if (((mask & MAY_WRITE) && !s->writeable) || + ((mask & MAY_READ) && !s->readable)) + rw_perms_s = s; + } + } + + put_cred(cred); + + doutc(cl, "root_squash_perms %d, rw_perms_s %p\n", root_squash_perms, + rw_perms_s); + if (root_squash_perms && rw_perms_s == NULL) { + doutc(cl, "access allowed\n"); + return 0; + } + + if (!root_squash_perms) { + doutc(cl, "root_squash is enabled and user(%d %d) isn't allowed to write", + caller_uid, caller_gid); + } + if (rw_perms_s) { + doutc(cl, "mds auth caps readable/writeable %d/%d while request r/w %d/%d", + rw_perms_s->readable, rw_perms_s->writeable, + !!(mask & MAY_READ), !!(mask & MAY_WRITE)); + } + doutc(cl, "access denied\n"); + return -EACCES; +} + /* * called before mount is ro, and before dentries are torn down. * (hmm, does this still race with new lookups?) 
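
ceph_mds_check_access() above only denies outright on -EACCES; any other failure makes its callers fall back to a synchronous request so the MDS performs the authoritative check. The call sites added in file.c, dir.c and inode.c all follow the same shape, roughly (a condensed sketch of the pattern, not a new helper in the patch):

static int check_mds_access_sketch(struct ceph_mds_client *mdsc,
				   struct inode *inode, int mask,
				   bool *do_sync)
{
	struct dentry *dentry = d_find_alias(inode);
	char *path;
	int pathlen, err = 0;
	u64 pathbase;

	if (!dentry) {
		*do_sync = true;		/* no path: let the MDS decide */
		return 0;
	}

	path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
	if (IS_ERR(path)) {
		*do_sync = true;
		err = 0;
	} else {
		err = ceph_mds_check_access(mdsc, path, mask);
	}
	ceph_mdsc_free_path(path, pathlen);
	dput(dentry);

	if (err == -EACCES)
		return err;			/* definite denial, fail fast */
	if (err < 0) {
		*do_sync = true;		/* inconclusive: defer to the MDS */
		err = 0;
	}
	return err;
}
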
@@ -5605,6 +5876,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) mutex_unlock(&mdsc->mutex); ceph_flush_dirty_caps(mdsc); + ceph_flush_cap_releases(mdsc); spin_lock(&mdsc->cap_dirty_lock); want_flush = mdsc->last_cap_flush_tid; if (!list_empty(&mdsc->cap_flush_list)) { @@ -5743,6 +6015,18 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); ceph_caps_finalize(mdsc); + + if (mdsc->s_cap_auths) { + int i; + + for (i = 0; i < mdsc->s_cap_auths_num; i++) { + kfree(mdsc->s_cap_auths[i].match.gids); + kfree(mdsc->s_cap_auths[i].match.path); + kfree(mdsc->s_cap_auths[i].match.fs_name); + } + kfree(mdsc->s_cap_auths); + } + ceph_pool_perm_destroy(mdsc); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index b88e80415224..3e2a6fa7c19a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -35,8 +35,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_32BITS_RETRY_FWD, CEPHFS_FEATURE_NEW_SNAPREALM_INFO, CEPHFS_FEATURE_HAS_OWNER_UIDGID, + CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ @@ -52,6 +53,7 @@ enum ceph_feature_type { CEPHFS_FEATURE_OP_GETVXATTR, \ CEPHFS_FEATURE_32BITS_RETRY_FWD, \ CEPHFS_FEATURE_HAS_OWNER_UIDGID, \ + CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, \ } /* @@ -71,6 +73,24 @@ enum ceph_feature_type { struct ceph_fs_client; struct ceph_cap; +#define MDS_AUTH_UID_ANY -1 + +struct ceph_mds_cap_match { + s64 uid; /* default to MDS_AUTH_UID_ANY */ + u32 num_gids; + u32 *gids; /* use these GIDs */ + char *path; /* require path to be child of this + (may be "" or "/" for any) */ + char *fs_name; + bool root_squash; /* default to false */ +}; + +struct ceph_mds_cap_auth { + struct ceph_mds_cap_match match; + bool readable; + bool writeable; +}; + /* * parsed info about a single inode. pointers are into the encoded * on-wire structures within the mds reply message payload. 
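
The comments in ceph_mds_auth_match() earlier in this patch define the rule applied to struct ceph_mds_cap_match's path field: after trailing slashes are stripped from both sides, match.path must be a whole-component prefix of the target path, so "/foo" covers "/foo" and "/foo/d" but not "/food". A self-contained userspace illustration of just that rule (not the kernel helper itself):

#include <stdbool.h>
#include <string.h>

/* Does 'auth_path' (no trailing '/') cover 'tpath' (no trailing '/')? */
static bool cap_path_covers(const char *auth_path, const char *tpath)
{
	size_t len = strlen(auth_path);
	size_t tlen = strlen(tpath);

	if (len == 0)
		return true;		/* empty match.path covers everything */
	if (tlen < len || strncmp(tpath, auth_path, len) != 0)
		return false;		/* not even a string prefix */
	/* the prefix must end on a path component boundary */
	return tlen == len || tpath[len] == '/';
}

/*
 * cap_path_covers("/foo", "/foo")   -> true
 * cap_path_covers("/foo", "/foo/d") -> true
 * cap_path_covers("/foo", "/food")  -> false
 */
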
@@ -279,6 +299,8 @@ struct ceph_mds_request { struct inode *r_target_inode; /* resulting inode */ struct inode *r_new_inode; /* new inode (for creates) */ + const struct qstr *r_dname; /* stable name (for ->d_revalidate) */ + #define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ #define CEPH_MDS_R_ABORTED (2) /* call was aborted */ #define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */ @@ -396,6 +418,8 @@ struct ceph_quotarealm_inode { struct inode *inode; }; +#ifdef CONFIG_DEBUG_FS + struct cap_wait { struct list_head list; u64 ino; @@ -404,6 +428,8 @@ struct cap_wait { int want; }; +#endif + enum { CEPH_MDSC_STOPPING_BEGIN = 1, CEPH_MDSC_STOPPING_FLUSHING = 2, @@ -432,6 +458,9 @@ struct ceph_mds_client { atomic_t stopping_blockers; struct completion stopping_waiter; + atomic64_t dirty_folios; + wait_queue_head_t flush_end_wq; + atomic64_t quotarealms_count; /* # realms with quota */ /* * We keep a list of inodes we don't see in the mountpoint but that we @@ -492,7 +521,9 @@ struct ceph_mds_client { spinlock_t caps_list_lock; struct list_head caps_list; /* unused (reserved or unreserved) */ +#ifdef CONFIG_DEBUG_FS struct list_head cap_wait_list; +#endif int caps_total_count; /* total caps allocated */ int caps_use_count; /* in use */ int caps_use_max; /* max used caps */ @@ -513,6 +544,9 @@ struct ceph_mds_client { struct rw_semaphore pool_perm_rwsem; struct rb_root pool_perm_tree; + u32 s_cap_auths_num; + struct ceph_mds_cap_auth *s_cap_auths; + char nodename[__NEW_UTS_LEN + 1]; }; @@ -530,9 +564,6 @@ extern struct ceph_mds_session * ceph_get_mds_session(struct ceph_mds_session *s); extern void ceph_put_mds_session(struct ceph_mds_session *s); -extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, - struct ceph_msg *msg, int mds); - extern int ceph_mdsc_init(struct ceph_fs_client *fsc); extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc); @@ -573,14 +604,17 @@ extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq); extern void __ceph_queue_cap_release(struct ceph_mds_session *session, struct ceph_cap *cap); -extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); +extern void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc); extern int ceph_iterate_session_caps(struct ceph_mds_session *session, int (*cb)(struct inode *, int mds, void *), void *arg); +extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, + int mask); + extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); static inline void ceph_mdsc_free_path(char *path, int len) @@ -605,8 +639,6 @@ extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, extern struct ceph_mds_session * ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); -extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); extern int ceph_trim_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 06ee397e0c3a..d90eda19bcc4 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -166,7 +166,7 
 	if (IS_ERR(in)) {
 		doutc(cl, "Can't lookup inode %llx (err: %ld)\n", realm->ino,
 		      PTR_ERR(in));
-		qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
+		qri->timeout = jiffies + secs_to_jiffies(60); /* XXX */
 	} else {
 		qri->timeout = 0;
 		qri->inode = in;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 885cb5d4e771..f3951253e393 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -126,6 +126,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
 	if (!wait) {
 		doutc(cl, "(non-blocking)\n");
 		ceph_flush_dirty_caps(fsc->mdsc);
+		ceph_flush_cap_releases(fsc->mdsc);
 		doutc(cl, "(non-blocking) done\n");
 		return 0;
 	}
@@ -284,8 +285,10 @@ static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end,
 	size_t len;
 	struct ceph_fsid fsid;
 	struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+	struct ceph_options *opts = pctx->copts;
 	struct ceph_mount_options *fsopt = pctx->opts;
-	char *fsid_start, *fs_name_start;
+	const char *name_start = dev_name;
+	const char *fsid_start, *fs_name_start;
 
 	if (*dev_name_end != '=') {
 		dout("separator '=' missing in source");
@@ -295,8 +298,14 @@ static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end,
 	fsid_start = strchr(dev_name, '@');
 	if (!fsid_start)
 		return invalfc(fc, "missing cluster fsid");
-	++fsid_start; /* start of cluster fsid */
+	len = fsid_start - name_start;
+	kfree(opts->name);
+	opts->name = kstrndup(name_start, len, GFP_KERNEL);
+	if (!opts->name)
+		return -ENOMEM;
+	dout("using %s entity name", opts->name);
+	++fsid_start; /* start of cluster fsid */
 	fs_name_start = strchr(fsid_start, '.');
 	if (!fs_name_start)
 		return invalfc(fc, "missing file system name");
@@ -422,6 +431,8 @@ static int ceph_parse_mount_param(struct fs_context *fc,
 
 	switch (token) {
 	case Opt_snapdirname:
+		if (strlen(param->string) > NAME_MAX)
+			return invalfc(fc, "snapdirname too long");
 		kfree(fsopt->snapdir_name);
 		fsopt->snapdir_name = param->string;
 		param->string = NULL;
@@ -961,7 +972,8 @@ static int __init init_caches(void)
 	if (!ceph_mds_request_cachep)
 		goto bad_mds_req;
 
-	ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT);
+	ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+		(CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT) * sizeof(struct page *));
 	if (!ceph_wb_pagevec_pool)
 		goto bad_pagevec_pool;
 
@@ -1551,6 +1563,17 @@ static void ceph_kill_sb(struct super_block *s)
 	 */
 	sync_filesystem(s);
 
+	if (atomic64_read(&mdsc->dirty_folios) > 0) {
+		wait_queue_head_t *wq = &mdsc->flush_end_wq;
+		long timeleft = wait_event_killable_timeout(*wq,
+					atomic64_read(&mdsc->dirty_folios) <= 0,
+					fsc->client->options->mount_timeout);
+		if (!timeleft) /* timed out */
+			pr_warn_client(cl, "umount timed out, %ld\n", timeleft);
+		else if (timeleft < 0) /* killed */
+			pr_warn_client(cl, "umount was killed, %ld\n", timeleft);
+	}
+
 	spin_lock(&mdsc->stopping_lock);
 	mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING;
 	wait = !!atomic_read(&mdsc->stopping_blockers);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b63b4cd9b5b6..bb0db0cc8003 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -5,7 +5,7 @@
 #include <linux/ceph/ceph_debug.h>
 #include <linux/ceph/osd_client.h>
 
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
 #include <linux/backing-dev.h>
 #include <linux/completion.h>
 #include <linux/exportfs.h>
@@ -60,7 +60,7 @@
 /* max size of osd read request, limited by libceph */
 #define CEPH_MAX_READ_SIZE CEPH_MSG_MAX_DATA_LEN
 
-/* osd has a configurable limitaion of max write size.
+/* osd has a configurable limitation of max write size.
  * CEPH_MSG_MAX_DATA_LEN should be small enough. */
 #define CEPH_MAX_WRITE_SIZE CEPH_MSG_MAX_DATA_LEN
 #define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */
@@ -200,9 +200,10 @@ struct ceph_cap {
 	struct list_head caps_item;
 };
 
-#define CHECK_CAPS_AUTHONLY   1  /* only check auth cap */
-#define CHECK_CAPS_FLUSH      2  /* flush any dirty caps */
-#define CHECK_CAPS_NOINVAL    4  /* don't invalidate pagecache */
+#define CHECK_CAPS_AUTHONLY	1  /* only check auth cap */
+#define CHECK_CAPS_FLUSH	2  /* flush any dirty caps */
+#define CHECK_CAPS_NOINVAL	4  /* don't invalidate pagecache */
+#define CHECK_CAPS_FLUSH_FORCE	8  /* force flush any caps */
 
 struct ceph_cap_flush {
 	u64 tid;
@@ -795,7 +796,6 @@ extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
 extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
 				      struct ceph_cap *ocap, int mask);
-extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
 extern int __ceph_caps_used(struct ceph_inode_info *ci);
 
 static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
@@ -903,7 +903,7 @@ ceph_find_rw_context(struct ceph_file_info *cf)
 }
 
 struct ceph_readdir_cache_control {
-	struct page  *page;
+	struct folio *folio;
 	struct dentry **dentries;
 	int index;
 };
@@ -1055,8 +1055,6 @@ extern int ceph_fill_trace(struct super_block *sb,
 extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 				    struct ceph_mds_session *session);
 
-extern int ceph_inode_holds_cap(struct inode *inode, int mask);
-
 extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
 extern void __ceph_do_pending_vmtruncate(struct inode *inode);
 
@@ -1134,8 +1132,7 @@ struct ceph_acl_sec_ctx {
 	void *acl;
 #endif
 #ifdef CONFIG_CEPH_FS_SECURITY_LABEL
-	void *sec_ctx;
-	u32 sec_ctxlen;
+	struct lsm_context lsmctx;
 #endif
 #ifdef CONFIG_FS_ENCRYPTION
 	struct ceph_fscrypt_auth *fscrypt_auth;
@@ -1207,10 +1204,6 @@ static inline void ceph_init_inode_acls(struct inode *inode,
 					struct ceph_acl_sec_ctx *as_ctx)
 {
 }
-static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
-{
-	return 0;
-}
 
 static inline void ceph_forget_all_cached_acls(struct inode *inode)
 {
@@ -1269,6 +1262,7 @@ extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
 extern void ceph_check_caps(struct ceph_inode_info *ci, int flags);
 extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
 extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
+extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc);
 extern int ceph_drop_caps_for_unlink(struct inode *inode);
 extern int ceph_encode_inode_release(void **p, struct inode *inode,
 				     int mds, int drop, int unless, int force);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index e066a556eccb..537165db4519 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -899,7 +899,7 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
 }
 
 /*
- * If there are dirty xattrs, reencode xattrs into the prealloc_blob
+ * If there are dirty xattrs, re-encode xattrs into the prealloc_blob
  * and swap into place. It returns the old i_xattrs.blob (or NULL) so
  * that it can be freed by the caller as the i_ceph_lock is likely to be
  * held.
@@ -1383,8 +1383,7 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
 	int err;
 
 	err = security_dentry_init_security(dentry, mode, &dentry->d_name,
-					    &name, &as_ctx->sec_ctx,
-					    &as_ctx->sec_ctxlen);
+					    &name, &as_ctx->lsmctx);
 	if (err < 0) {
 		WARN_ON_ONCE(err != -EOPNOTSUPP);
 		err = 0; /* do nothing */
@@ -1409,7 +1408,7 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
 	 */
 	name_len = strlen(name);
 	err = ceph_pagelist_reserve(pagelist,
-				    4 * 2 + name_len + as_ctx->sec_ctxlen);
+				    4 * 2 + name_len + as_ctx->lsmctx.len);
 	if (err)
 		goto out;
 
@@ -1432,8 +1431,9 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
 	ceph_pagelist_encode_32(pagelist, name_len);
 	ceph_pagelist_append(pagelist, name, name_len);
 
-	ceph_pagelist_encode_32(pagelist, as_ctx->sec_ctxlen);
-	ceph_pagelist_append(pagelist, as_ctx->sec_ctx, as_ctx->sec_ctxlen);
+	ceph_pagelist_encode_32(pagelist, as_ctx->lsmctx.len);
+	ceph_pagelist_append(pagelist, as_ctx->lsmctx.context,
+			     as_ctx->lsmctx.len);
 
 	err = 0;
 out:
@@ -1451,7 +1451,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
 	posix_acl_release(as_ctx->default_acl);
 #endif
 #ifdef CONFIG_CEPH_FS_SECURITY_LABEL
-	security_release_secctx(as_ctx->sec_ctx, as_ctx->sec_ctxlen);
+	security_release_secctx(&as_ctx->lsmctx);
 #endif
 #ifdef CONFIG_FS_ENCRYPTION
 	kfree(as_ctx->fscrypt_auth);