Diffstat (limited to 'fs')
97 files changed, 3321 insertions, 2008 deletions
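The most pervasive change in the fs/ceph hunks below is the retirement of s_gen_ttl_lock: s_cap_gen becomes an atomic_t, readers switch from a spin_lock/spin_unlock pair to a single atomic_read(), and the stale-session path bumps it with atomic_inc(). As a rough userspace analogue of that before/after pattern (C11 atomics standing in for the kernel's atomic_t API; the struct and function names here are illustrative, only s_cap_gen and the atomic_read()/atomic_inc() calls come from the patch):

#include <stdatomic.h>
#include <stdio.h>

/* After the conversion: a lone generation counter needs no dedicated
 * spinlock -- readers do one atomic load, the "caps went stale" path
 * does one atomic increment. */
struct session {
	atomic_uint cap_gen;			/* kernel: atomic_t s_cap_gen */
};

static unsigned int read_gen(struct session *s)
{
	return atomic_load(&s->cap_gen);	/* kernel: atomic_read(&s->s_cap_gen) */
}

static void mark_stale(struct session *s)
{
	atomic_fetch_add(&s->cap_gen, 1);	/* kernel: atomic_inc(&s->s_cap_gen) */
}

int main(void)
{
	struct session s = { .cap_gen = 1 };	/* register_session() starts at 1 */
	mark_stale(&s);
	printf("cap_gen = %u\n", read_gen(&s));	/* prints 2 */
	return 0;
}

The companion field s_cap_ttl stays a plain unsigned long; the mds_client.h hunk re-documents it as protected by s_mutex rather than by the removed spinlock.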
diff --git a/fs/block_dev.c b/fs/block_dev.c index 7e83c3e71504..0c424a0cadaa 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1609,7 +1609,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) * Does not take i_mutex for the write and thus is not for general purpose * use. */ -ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *bd_inode = bdev_file_inode(file); @@ -1647,9 +1647,8 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_finish_plug(&plug); return ret; } -EXPORT_SYMBOL_GPL(blkdev_write_iter); -ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) +static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *bd_inode = bdev_file_inode(file); @@ -1671,7 +1670,6 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; } -EXPORT_SYMBOL_GPL(blkdev_read_iter); static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c1570fada3d8..a1e2813731d1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -82,10 +82,6 @@ static int ceph_set_page_dirty(struct page *page) struct inode *inode; struct ceph_inode_info *ci; struct ceph_snap_context *snapc; - int ret; - - if (unlikely(!mapping)) - return !TestSetPageDirty(page); if (PageDirty(page)) { dout("%p set_page_dirty %p idx %lu -- already dirty\n", @@ -130,11 +126,7 @@ static int ceph_set_page_dirty(struct page *page) BUG_ON(PagePrivate(page)); attach_page_private(page, snapc); - ret = __set_page_dirty_nobuffers(page); - WARN_ON(!PageLocked(page)); - WARN_ON(!page->mapping); - - return ret; + return __set_page_dirty_nobuffers(page); } /* @@ -226,7 +218,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) int err = req->r_result; ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, err); + req->r_end_latency, osd_data->length, err); dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result, subreq->len, i_size_read(req->r_inode)); @@ -313,7 +305,7 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) ceph_put_cap_refs(ci, got); } -const struct netfs_read_request_ops ceph_netfs_read_ops = { +static const struct netfs_read_request_ops ceph_netfs_read_ops = { .init_rreq = ceph_init_rreq, .is_cache_enabled = ceph_is_cache_enabled, .begin_cache_operation = ceph_begin_cache_operation, @@ -560,7 +552,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) err = ceph_osdc_wait_request(osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, err); + req->r_end_latency, len, err); ceph_osdc_put_request(req); if (err == 0) @@ -635,6 +627,7 @@ static void writepages_finish(struct ceph_osd_request *req) struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + unsigned int len = 0; bool remove_page; dout("writepages_finish %p rc %d\n", inode, rc); @@ -647,9 +640,6 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_clear_error_write(ci); } - ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, rc); - /* * We lost the 
cache cap, need to truncate the page before * it is unlocked, otherwise we'd truncate it later in the @@ -666,6 +656,7 @@ static void writepages_finish(struct ceph_osd_request *req) osd_data = osd_req_op_extent_osd_data(req, i); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + len += osd_data->length; num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); total_pages += num_pages; @@ -696,6 +687,9 @@ static void writepages_finish(struct ceph_osd_request *req) release_pages(osd_data->pages, num_pages); } + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, len, rc); + ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); osd_data = osd_req_op_extent_osd_data(req, 0); @@ -1711,7 +1705,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, err); + req->r_end_latency, len, err); out_put: ceph_osdc_put_request(req); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a5e93b185515..7bdefd0c789a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -645,9 +645,7 @@ void ceph_add_cap(struct inode *inode, dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, session->s_mds, cap_id, ceph_cap_string(issued), seq); - spin_lock(&session->s_gen_ttl_lock); - gen = session->s_cap_gen; - spin_unlock(&session->s_gen_ttl_lock); + gen = atomic_read(&session->s_cap_gen); cap = __get_cap_for_mds(ci, mds); if (!cap) { @@ -785,10 +783,8 @@ static int __cap_is_valid(struct ceph_cap *cap) unsigned long ttl; u32 gen; - spin_lock(&cap->session->s_gen_ttl_lock); - gen = cap->session->s_cap_gen; + gen = atomic_read(&cap->session->s_cap_gen); ttl = cap->session->s_cap_ttl; - spin_unlock(&cap->session->s_gen_ttl_lock); if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { dout("__cap_is_valid %p cap %p issued %s " @@ -1182,7 +1178,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) * s_cap_gen while session is in the reconnect state. */ if (queue_release && - (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { + (!session->s_cap_reconnect || + cap->cap_gen == atomic_read(&session->s_cap_gen))) { cap->queue_release = 1; if (removed) { __ceph_queue_cap_release(session, cap); @@ -1534,7 +1531,7 @@ static inline int __send_flush_snap(struct inode *inode, * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * - * Called under i_ceph_lock. Takes s_mutex as needed. + * Called under i_ceph_lock. 
*/ static void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session *session) @@ -1656,7 +1653,6 @@ retry: mds = ci->i_auth_cap->session->s_mds; if (session && session->s_mds != mds) { dout(" oops, wrong session %p mutex\n", session); - mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); session = NULL; } @@ -1665,10 +1661,6 @@ retry: mutex_lock(&mdsc->mutex); session = __ceph_lookup_mds_session(mdsc, mds); mutex_unlock(&mdsc->mutex); - if (session) { - dout(" inverting session/ino locks on %p\n", session); - mutex_lock(&session->s_mutex); - } goto retry; } @@ -1680,12 +1672,10 @@ retry: out: spin_unlock(&ci->i_ceph_lock); - if (psession) { + if (psession) *psession = session; - } else if (session) { - mutex_unlock(&session->s_mutex); + else ceph_put_mds_session(session); - } /* we flushed them all; remove this inode from the queue */ spin_lock(&mdsc->snap_flush_lock); list_del_init(&ci->i_snap_flush_item); @@ -1915,7 +1905,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_cap *cap; u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; - int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list to avoid an infinite loop on retry */ @@ -1923,14 +1912,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, bool queue_invalidate = false; bool tried_invalidate = false; + if (session) + ceph_get_mds_session(session); + spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; - - goto retry_locked; retry: - spin_lock(&ci->i_ceph_lock); -retry_locked: /* Caps wanted by virtue of active open files. */ file_wanted = __ceph_caps_file_wanted(ci); @@ -2010,7 +1998,7 @@ retry_locked: ci->i_rdcache_revoking = ci->i_rdcache_gen; } tried_invalidate = true; - goto retry_locked; + goto retry; } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { @@ -2024,8 +2012,6 @@ retry_locked: ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) continue; - /* NOTE: no side-effects allowed, until we take s_mutex */ - /* * If we have an auth cap, we don't need to consider any * overlapping caps as used. @@ -2088,37 +2074,8 @@ retry_locked: continue; /* nope, all good */ ack: - if (session && session != cap->session) { - dout("oops, wrong session %p mutex\n", session); - mutex_unlock(&session->s_mutex); - session = NULL; - } - if (!session) { - session = cap->session; - if (mutex_trylock(&session->s_mutex) == 0) { - dout("inverting session/ino locks on %p\n", - session); - session = ceph_get_mds_session(session); - spin_unlock(&ci->i_ceph_lock); - if (took_snap_rwsem) { - up_read(&mdsc->snap_rwsem); - took_snap_rwsem = 0; - } - if (session) { - mutex_lock(&session->s_mutex); - ceph_put_mds_session(session); - } else { - /* - * Because we take the reference while - * holding the i_ceph_lock, it should - * never be NULL. Throw a warning if it - * ever is. 
- */ - WARN_ON_ONCE(true); - } - goto retry; - } - } + ceph_put_mds_session(session); + session = ceph_get_mds_session(cap->session); /* kick flushing and flush snaps before sending normal * cap message */ @@ -2130,20 +2087,7 @@ ack: if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) __ceph_flush_snaps(ci, session); - goto retry_locked; - } - - /* take snap_rwsem after session mutex */ - if (!took_snap_rwsem) { - if (down_read_trylock(&mdsc->snap_rwsem) == 0) { - dout("inverting snap/in locks on %p\n", - inode); - spin_unlock(&ci->i_ceph_lock); - down_read(&mdsc->snap_rwsem); - took_snap_rwsem = 1; - goto retry; - } - took_snap_rwsem = 1; + goto retry; } if (cap == ci->i_auth_cap && ci->i_dirty_caps) { @@ -2165,9 +2109,10 @@ ack: __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, want, retain, flushing, flush_tid, oldest_flush_tid); - spin_unlock(&ci->i_ceph_lock); + spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); + spin_lock(&ci->i_ceph_lock); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -2182,13 +2127,9 @@ ack: spin_unlock(&ci->i_ceph_lock); + ceph_put_mds_session(session); if (queue_invalidate) ceph_queue_invalidate(inode); - - if (session) - mutex_unlock(&session->s_mutex); - if (took_snap_rwsem) - up_read(&mdsc->snap_rwsem); } /* @@ -2198,26 +2139,17 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_session *session = NULL; int flushing = 0; u64 flush_tid = 0, oldest_flush_tid = 0; -retry: spin_lock(&ci->i_ceph_lock); retry_locked: if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; struct cap_msg_args arg; + struct ceph_mds_session *session = cap->session; - if (session != cap->session) { - spin_unlock(&ci->i_ceph_lock); - if (session) - mutex_unlock(&session->s_mutex); - session = cap->session; - mutex_lock(&session->s_mutex); - goto retry; - } - if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) { + if (session->s_state < CEPH_MDS_SESSION_OPEN) { spin_unlock(&ci->i_ceph_lock); goto out; } @@ -2254,9 +2186,6 @@ retry_locked: spin_unlock(&ci->i_ceph_lock); } out: - if (session) - mutex_unlock(&session->s_mutex); - *ptid = flush_tid; return flushing; } @@ -3213,8 +3142,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (complete_capsnap) wake_up_all(&ci->i_cap_wq); while (put-- > 0) { - /* avoid calling iput_final() in osd dispatch threads */ - ceph_async_iput(inode); + iput(inode); } } @@ -3288,7 +3216,7 @@ static void handle_cap_grant(struct inode *inode, u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); unsigned char check_caps = 0; - bool was_stale = cap->cap_gen < session->s_cap_gen; + bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen); bool wake = false; bool writeback = false; bool queue_trunc = false; @@ -3340,7 +3268,7 @@ static void handle_cap_grant(struct inode *inode, } /* side effects now are allowed */ - cap->cap_gen = session->s_cap_gen; + cap->cap_gen = atomic_read(&session->s_cap_gen); cap->seq = seq; __check_cap_issue(ci, cap, newcaps); @@ -3553,13 +3481,12 @@ static void handle_cap_grant(struct inode *inode, if (wake) wake_up_all(&ci->i_cap_wq); + mutex_unlock(&session->s_mutex); if (check_caps == 1) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL, session); else if (check_caps == 2) ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session); - else - mutex_unlock(&session->s_mutex); } /* @@ 
-4203,8 +4130,7 @@ done: mutex_unlock(&session->s_mutex); done_unlocked: ceph_put_string(extra_info.pool_ns); - /* avoid calling iput_final() in mds dispatch threads */ - ceph_async_iput(inode); + iput(inode); return; flush_cap_releases: @@ -4246,8 +4172,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->cap_delay_lock); dout("check_delayed_caps on %p\n", inode); ceph_check_caps(ci, 0, NULL); - /* avoid calling iput_final() in tick thread */ - ceph_async_iput(inode); + iput(inode); spin_lock(&mdsc->cap_delay_lock); } } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 425f3356332a..38b78b45811f 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -127,7 +127,7 @@ static int mdsc_show(struct seq_file *s, void *p) return 0; } -#define CEPH_METRIC_SHOW(name, total, avg, min, max, sq) { \ +#define CEPH_LAT_METRIC_SHOW(name, total, avg, min, max, sq) { \ s64 _total, _avg, _min, _max, _sq, _st; \ _avg = ktime_to_us(avg); \ _min = ktime_to_us(min == KTIME_MAX ? 0 : min); \ @@ -140,6 +140,12 @@ static int mdsc_show(struct seq_file *s, void *p) name, total, _avg, _min, _max, _st); \ } +#define CEPH_SZ_METRIC_SHOW(name, total, avg, min, max, sum) { \ + u64 _min = min == U64_MAX ? 0 : min; \ + seq_printf(s, "%-14s%-12lld%-16llu%-16llu%-16llu%llu\n", \ + name, total, avg, _min, max, sum); \ +} + static int metric_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; @@ -147,6 +153,7 @@ static int metric_show(struct seq_file *s, void *p) struct ceph_client_metric *m = &mdsc->metric; int nr_caps = 0; s64 total, sum, avg, min, max, sq; + u64 sum_sz, avg_sz, min_sz, max_sz; sum = percpu_counter_sum(&m->total_inodes); seq_printf(s, "item total\n"); @@ -170,7 +177,7 @@ static int metric_show(struct seq_file *s, void *p) max = m->read_latency_max; sq = m->read_latency_sq_sum; spin_unlock(&m->read_metric_lock); - CEPH_METRIC_SHOW("read", total, avg, min, max, sq); + CEPH_LAT_METRIC_SHOW("read", total, avg, min, max, sq); spin_lock(&m->write_metric_lock); total = m->total_writes; @@ -180,7 +187,7 @@ static int metric_show(struct seq_file *s, void *p) max = m->write_latency_max; sq = m->write_latency_sq_sum; spin_unlock(&m->write_metric_lock); - CEPH_METRIC_SHOW("write", total, avg, min, max, sq); + CEPH_LAT_METRIC_SHOW("write", total, avg, min, max, sq); spin_lock(&m->metadata_metric_lock); total = m->total_metadatas; @@ -190,7 +197,29 @@ static int metric_show(struct seq_file *s, void *p) max = m->metadata_latency_max; sq = m->metadata_latency_sq_sum; spin_unlock(&m->metadata_metric_lock); - CEPH_METRIC_SHOW("metadata", total, avg, min, max, sq); + CEPH_LAT_METRIC_SHOW("metadata", total, avg, min, max, sq); + + seq_printf(s, "\n"); + seq_printf(s, "item total avg_sz(bytes) min_sz(bytes) max_sz(bytes) total_sz(bytes)\n"); + seq_printf(s, "----------------------------------------------------------------------------------------\n"); + + spin_lock(&m->read_metric_lock); + total = m->total_reads; + sum_sz = m->read_size_sum; + avg_sz = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum_sz, total) : 0; + min_sz = m->read_size_min; + max_sz = m->read_size_max; + spin_unlock(&m->read_metric_lock); + CEPH_SZ_METRIC_SHOW("read", total, avg_sz, min_sz, max_sz, sum_sz); + + spin_lock(&m->write_metric_lock); + total = m->total_writes; + sum_sz = m->write_size_sum; + avg_sz = total > 0 ? 
DIV64_U64_ROUND_CLOSEST(sum_sz, total) : 0; + min_sz = m->write_size_min; + max_sz = m->write_size_max; + spin_unlock(&m->write_metric_lock); + CEPH_SZ_METRIC_SHOW("write", total, avg_sz, min_sz, max_sz, sum_sz); seq_printf(s, "\n"); seq_printf(s, "item total miss hit\n"); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 9ba79b6531fb..133dbd9338e7 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -788,6 +788,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, mask |= CEPH_CAP_XATTR_SHARED; req->r_args.getattr.mask = cpu_to_le32(mask); + ihold(dir); req->r_parent = dir; set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, NULL, req); @@ -868,6 +869,7 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; + ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mknod.mode = cpu_to_le32(mode); req->r_args.mknod.rdev = cpu_to_le32(rdev); @@ -929,6 +931,8 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir, goto out; } req->r_parent = dir; + ihold(dir); + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry = dget(dentry); req->r_num_caps = 2; @@ -993,6 +997,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; + ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mkdir.mode = cpu_to_le32(mode); req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; @@ -1037,6 +1042,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); req->r_parent = dir; + ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -1158,6 +1164,7 @@ retry: req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; + ihold(dir); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_inode_drop = ceph_drop_caps_for_unlink(inode); @@ -1232,6 +1239,7 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir, req->r_old_dentry = dget(old_dentry); req->r_old_dentry_dir = old_dir; req->r_parent = new_dir; + ihold(new_dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -1548,10 +1556,8 @@ static bool __dentry_lease_is_valid(struct ceph_dentry_info *di) u32 gen; unsigned long ttl; - spin_lock(&session->s_gen_ttl_lock); - gen = session->s_cap_gen; + gen = atomic_read(&session->s_cap_gen); ttl = session->s_cap_ttl; - spin_unlock(&session->s_gen_ttl_lock); if (di->lease_gen == gen && time_before(jiffies, ttl) && @@ -1730,6 +1736,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; + ihold(dir); mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; if (ceph_security_xattr_wanted(dir)) @@ -1809,8 +1816,7 @@ static void ceph_d_release(struct dentry *dentry) dentry->d_fsdata = NULL; spin_unlock(&dentry->d_lock); - if (di->lease_session) - ceph_put_mds_session(di->lease_session); + ceph_put_mds_session(di->lease_session); kmem_cache_free(ceph_dentry_cachep, di); } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 65540a4429b2..1d65934c1262 100644 
--- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -542,6 +542,7 @@ static int ceph_get_name(struct dentry *parent, char *name, ihold(inode); req->r_ino2 = ceph_vino(d_inode(parent)); req->r_parent = d_inode(parent); + ihold(req->r_parent); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d51af3698032..d1755ac1d964 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -706,6 +706,7 @@ retry: mask |= CEPH_CAP_XATTR_SHARED; req->r_args.open.mask = cpu_to_le32(mask); req->r_parent = dir; + ihold(dir); if (flags & O_CREAT) { struct ceph_file_layout lo; @@ -903,7 +904,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, - ret); + len, ret); ceph_osdc_put_request(req); @@ -1035,12 +1036,12 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) struct ceph_aio_request *aio_req = req->r_priv; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; + unsigned int len = osd_data->bvec_pos.iter.bi_size; BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); BUG_ON(!osd_data->num_bvecs); - dout("ceph_aio_complete_req %p rc %d bytes %u\n", - inode, rc, osd_data->bvec_pos.iter.bi_size); + dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, len); if (rc == -EOLDSNAPC) { struct ceph_aio_work *aio_work; @@ -1058,9 +1059,9 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } else if (!aio_req->write) { if (rc == -ENOENT) rc = 0; - if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) { + if (rc >= 0 && len > rc) { struct iov_iter i; - int zlen = osd_data->bvec_pos.iter.bi_size - rc; + int zlen = len - rc; /* * If read is satisfied by single OSD request, @@ -1077,8 +1078,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs, - osd_data->num_bvecs, - osd_data->bvec_pos.iter.bi_size); + osd_data->num_bvecs, len); iov_iter_advance(&i, rc); iov_iter_zero(zlen, &i); } @@ -1088,10 +1088,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) if (req->r_start_latency) { if (aio_req->write) ceph_update_write_metrics(metric, req->r_start_latency, - req->r_end_latency, rc); + req->r_end_latency, len, rc); else ceph_update_read_metrics(metric, req->r_start_latency, - req->r_end_latency, rc); + req->r_end_latency, len, rc); } put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, @@ -1299,10 +1299,10 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (write) ceph_update_write_metrics(metric, req->r_start_latency, - req->r_end_latency, ret); + req->r_end_latency, len, ret); else ceph_update_read_metrics(metric, req->r_start_latency, - req->r_end_latency, ret); + req->r_end_latency, len, ret); size = i_size_read(inode); if (!write) { @@ -1476,7 +1476,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, ret = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, ret); + req->r_end_latency, len, ret); out: ceph_osdc_put_request(req); if (ret != 0) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index df0c8a724609..1bd2cc015913 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1124,7 +1124,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, 
return; } - if (di->lease_gen == session->s_cap_gen && + if (di->lease_gen == atomic_read(&session->s_cap_gen) && time_before(ttl, di->time)) return; /* we already have a newer lease. */ @@ -1135,7 +1135,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, if (!di->lease_session) di->lease_session = ceph_get_mds_session(session); - di->lease_gen = session->s_cap_gen; + di->lease_gen = atomic_read(&session->s_cap_gen); di->lease_seq = le32_to_cpu(lease->seq); di->lease_renew_after = half_ttl; di->lease_renew_from = 0; @@ -1154,8 +1154,7 @@ static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry, __update_dentry_lease(dir, dentry, lease, session, from_time, &old_lease_session); spin_unlock(&dentry->d_lock); - if (old_lease_session) - ceph_put_mds_session(old_lease_session); + ceph_put_mds_session(old_lease_session); } /* @@ -1200,8 +1199,7 @@ static void update_dentry_lease_careful(struct dentry *dentry, from_time, &old_lease_session); out_unlock: spin_unlock(&dentry->d_lock); - if (old_lease_session) - ceph_put_mds_session(old_lease_session); + ceph_put_mds_session(old_lease_session); } /* @@ -1568,8 +1566,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, unlock_new_inode(in); } - /* avoid calling iput_final() in mds dispatch threads */ - ceph_async_iput(in); + iput(in); } return err; @@ -1766,13 +1763,11 @@ retry_lookup: if (ret < 0) { pr_err("ceph_fill_inode badness on %p\n", in); if (d_really_is_negative(dn)) { - /* avoid calling iput_final() in mds - * dispatch threads */ if (in->i_state & I_NEW) { ihold(in); discard_new_inode(in); } - ceph_async_iput(in); + iput(in); } d_drop(dn); err = ret; @@ -1785,7 +1780,7 @@ retry_lookup: if (ceph_security_xattr_deadlock(in)) { dout(" skip splicing dn %p to inode %p" " (security xattr deadlock)\n", dn, in); - ceph_async_iput(in); + iput(in); skipped++; goto next_item; } @@ -1836,25 +1831,6 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size) return ret; } -/* - * Put reference to inode, but avoid calling iput_final() in current thread. - * iput_final() may wait for reahahead pages. The wait can cause deadlock in - * some contexts. 
- */ -void ceph_async_iput(struct inode *inode) -{ - if (!inode) - return; - for (;;) { - if (atomic_add_unless(&inode->i_count, -1, 1)) - break; - if (queue_work(ceph_inode_to_client(inode)->inode_wq, - &ceph_inode(inode)->i_work)) - break; - /* queue work failed, i_count must be at least 2 */ - } -} - void ceph_queue_inode_work(struct inode *inode, int work_bit) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index e5af591d3bd4..a818213c972f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -664,6 +664,9 @@ struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s) void ceph_put_mds_session(struct ceph_mds_session *s) { + if (IS_ERR_OR_NULL(s)) + return; + dout("mdsc put_session %p %d -> %d\n", s, refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1); if (refcount_dec_and_test(&s->s_ref)) { @@ -746,8 +749,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); - spin_lock_init(&s->s_gen_ttl_lock); - s->s_cap_gen = 1; + atomic_set(&s->s_cap_gen, 1); s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); @@ -822,14 +824,13 @@ void ceph_mdsc_release_request(struct kref *kref) ceph_msg_put(req->r_reply); if (req->r_inode) { ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); - /* avoid calling iput_final() in mds dispatch threads */ - ceph_async_iput(req->r_inode); + iput(req->r_inode); } if (req->r_parent) { ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); - ceph_async_iput(req->r_parent); + iput(req->r_parent); } - ceph_async_iput(req->r_target_inode); + iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) @@ -843,7 +844,7 @@ void ceph_mdsc_release_request(struct kref *kref) */ ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); - ceph_async_iput(req->r_old_dentry_dir); + iput(req->r_old_dentry_dir); } kfree(req->r_path1); kfree(req->r_path2); @@ -958,8 +959,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } if (req->r_unsafe_dir) { - /* avoid calling iput_final() in mds dispatch threads */ - ceph_async_iput(req->r_unsafe_dir); + iput(req->r_unsafe_dir); req->r_unsafe_dir = NULL; } @@ -1130,7 +1130,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); if (!cap) { spin_unlock(&ci->i_ceph_lock); - ceph_async_iput(inode); + iput(inode); goto random; } mds = cap->session->s_mds; @@ -1139,9 +1139,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, cap == ci->i_auth_cap ? 
"auth " : "", cap); spin_unlock(&ci->i_ceph_lock); out: - /* avoid calling iput_final() while holding mdsc->mutex or - * in mds dispatch threads */ - ceph_async_iput(inode); + iput(inode); return mds; random: @@ -1438,8 +1436,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, for (i = 0; i < mi->num_export_targets; i++) { ts = __open_export_target_session(mdsc, mi->export_targets[i]); - if (!IS_ERR(ts)) - ceph_put_mds_session(ts); + ceph_put_mds_session(ts); } } @@ -1545,9 +1542,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, spin_unlock(&session->s_cap_lock); if (last_inode) { - /* avoid calling iput_final() while holding - * s_mutex or in mds dispatch threads */ - ceph_async_iput(last_inode); + iput(last_inode); last_inode = NULL; } if (old_cap) { @@ -1581,7 +1576,7 @@ out: session->s_cap_iterator = NULL; spin_unlock(&session->s_cap_lock); - ceph_async_iput(last_inode); + iput(last_inode); if (old_cap) ceph_put_cap(session->s_mdsc, old_cap); @@ -1721,8 +1716,7 @@ static void remove_session_caps(struct ceph_mds_session *session) spin_unlock(&session->s_cap_lock); inode = ceph_find_inode(sb, vino); - /* avoid calling iput_final() while holding s_mutex */ - ceph_async_iput(inode); + iput(inode); spin_lock(&session->s_cap_lock); } @@ -1761,7 +1755,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } else if (ev == RENEWCAPS) { - if (cap->cap_gen < cap->session->s_cap_gen) { + if (cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) { /* mds did not re-issue stale cap */ spin_lock(&ci->i_ceph_lock); cap->issued = cap->implemented = CEPH_CAP_PIN; @@ -2988,7 +2982,6 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, ceph_take_cap_refs(ci, CEPH_CAP_PIN, false); __ceph_touch_fmode(ci, mdsc, fmode); spin_unlock(&ci->i_ceph_lock); - ihold(req->r_parent); } if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), @@ -3499,10 +3492,8 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_STALE: pr_info("mds%d caps went stale, renewing\n", session->s_mds); - spin_lock(&session->s_gen_ttl_lock); - session->s_cap_gen++; + atomic_inc(&session->s_cap_gen); session->s_cap_ttl = jiffies - 1; - spin_unlock(&session->s_gen_ttl_lock); send_renew_caps(mdsc, session); break; @@ -3771,7 +3762,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ cap->mseq = 0; /* and migrate_seq */ - cap->cap_gen = cap->session->s_cap_gen; + cap->cap_gen = atomic_read(&cap->session->s_cap_gen); /* These are lost when the session goes away */ if (S_ISDIR(inode->i_mode)) { @@ -4011,9 +4002,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, dout("session %p state %s\n", session, ceph_session_state_name(session->s_state)); - spin_lock(&session->s_gen_ttl_lock); - session->s_cap_gen++; - spin_unlock(&session->s_gen_ttl_lock); + atomic_inc(&session->s_cap_gen); spin_lock(&session->s_cap_lock); /* don't know if session is readonly */ @@ -4344,7 +4333,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, case CEPH_MDS_LEASE_RENEW: if (di->lease_session == session && - di->lease_gen == session->s_cap_gen && + di->lease_gen == atomic_read(&session->s_cap_gen) && di->lease_renew_from && di->lease_renew_after == 0) { unsigned long duration = @@ -4372,8 +4361,7 @@ release: out: mutex_unlock(&session->s_mutex); - /* 
avoid calling iput_final() in mds dispatch threads */ - ceph_async_iput(inode); + iput(inode); return; bad: diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 15c11a0f2caf..20e42d8b66c6 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -186,10 +186,8 @@ struct ceph_mds_session { struct ceph_auth_handshake s_auth; - /* protected by s_gen_ttl_lock */ - spinlock_t s_gen_ttl_lock; - u32 s_cap_gen; /* inc each time we get mds stale msg */ - unsigned long s_cap_ttl; /* when session caps expire */ + atomic_t s_cap_gen; /* inc each time we get mds stale msg */ + unsigned long s_cap_ttl; /* when session caps expire. protected by s_mutex */ /* protected by s_cap_lock */ spinlock_t s_cap_lock; diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 28b6b42ad677..5ac151eb0d49 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -20,8 +20,11 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_opened_files *files; struct ceph_pinned_icaps *icaps; struct ceph_opened_inodes *inodes; + struct ceph_read_io_size *rsize; + struct ceph_write_io_size *wsize; struct ceph_client_metric *m = &mdsc->metric; u64 nr_caps = atomic64_read(&m->total_caps); + u32 header_len = sizeof(struct ceph_metric_header); struct ceph_msg *msg; struct timespec64 ts; s64 sum; @@ -30,7 +33,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) + sizeof(*meta) + sizeof(*dlease) + sizeof(*files) - + sizeof(*icaps) + sizeof(*inodes); + + sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize) + + sizeof(*wsize); msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); if (!msg) { @@ -43,10 +47,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the cap metric */ cap = (struct ceph_metric_cap *)(head + 1); - cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO); - cap->ver = 1; - cap->compat = 1; - cap->data_len = cpu_to_le32(sizeof(*cap) - 10); + cap->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO); + cap->header.ver = 1; + cap->header.compat = 1; + cap->header.data_len = cpu_to_le32(sizeof(*cap) - header_len); cap->hit = cpu_to_le64(percpu_counter_sum(&m->i_caps_hit)); cap->mis = cpu_to_le64(percpu_counter_sum(&m->i_caps_mis)); cap->total = cpu_to_le64(nr_caps); @@ -54,10 +58,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the read latency metric */ read = (struct ceph_metric_read_latency *)(cap + 1); - read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); - read->ver = 1; - read->compat = 1; - read->data_len = cpu_to_le32(sizeof(*read) - 10); + read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); + read->header.ver = 1; + read->header.compat = 1; + read->header.data_len = cpu_to_le32(sizeof(*read) - header_len); sum = m->read_latency_sum; jiffies_to_timespec64(sum, &ts); read->sec = cpu_to_le32(ts.tv_sec); @@ -66,10 +70,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the write latency metric */ write = (struct ceph_metric_write_latency *)(read + 1); - write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); - write->ver = 1; - write->compat = 1; - write->data_len = cpu_to_le32(sizeof(*write) - 10); + write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); + write->header.ver = 1; + write->header.compat = 1; + write->header.data_len = cpu_to_le32(sizeof(*write) - header_len); sum = m->write_latency_sum; jiffies_to_timespec64(sum, &ts); write->sec = 
cpu_to_le32(ts.tv_sec); @@ -78,10 +82,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the metadata latency metric */ meta = (struct ceph_metric_metadata_latency *)(write + 1); - meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); - meta->ver = 1; - meta->compat = 1; - meta->data_len = cpu_to_le32(sizeof(*meta) - 10); + meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); + meta->header.ver = 1; + meta->header.compat = 1; + meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len); sum = m->metadata_latency_sum; jiffies_to_timespec64(sum, &ts); meta->sec = cpu_to_le32(ts.tv_sec); @@ -90,10 +94,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the dentry lease metric */ dlease = (struct ceph_metric_dlease *)(meta + 1); - dlease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE); - dlease->ver = 1; - dlease->compat = 1; - dlease->data_len = cpu_to_le32(sizeof(*dlease) - 10); + dlease->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE); + dlease->header.ver = 1; + dlease->header.compat = 1; + dlease->header.data_len = cpu_to_le32(sizeof(*dlease) - header_len); dlease->hit = cpu_to_le64(percpu_counter_sum(&m->d_lease_hit)); dlease->mis = cpu_to_le64(percpu_counter_sum(&m->d_lease_mis)); dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries)); @@ -103,34 +107,54 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the opened files metric */ files = (struct ceph_opened_files *)(dlease + 1); - files->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES); - files->ver = 1; - files->compat = 1; - files->data_len = cpu_to_le32(sizeof(*files) - 10); + files->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES); + files->header.ver = 1; + files->header.compat = 1; + files->header.data_len = cpu_to_le32(sizeof(*files) - header_len); files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files)); files->total = cpu_to_le64(sum); items++; /* encode the pinned icaps metric */ icaps = (struct ceph_pinned_icaps *)(files + 1); - icaps->type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS); - icaps->ver = 1; - icaps->compat = 1; - icaps->data_len = cpu_to_le32(sizeof(*icaps) - 10); + icaps->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS); + icaps->header.ver = 1; + icaps->header.compat = 1; + icaps->header.data_len = cpu_to_le32(sizeof(*icaps) - header_len); icaps->pinned_icaps = cpu_to_le64(nr_caps); icaps->total = cpu_to_le64(sum); items++; /* encode the opened inodes metric */ inodes = (struct ceph_opened_inodes *)(icaps + 1); - inodes->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES); - inodes->ver = 1; - inodes->compat = 1; - inodes->data_len = cpu_to_le32(sizeof(*inodes) - 10); + inodes->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES); + inodes->header.ver = 1; + inodes->header.compat = 1; + inodes->header.data_len = cpu_to_le32(sizeof(*inodes) - header_len); inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes)); inodes->total = cpu_to_le64(sum); items++; + /* encode the read io size metric */ + rsize = (struct ceph_read_io_size *)(inodes + 1); + rsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_IO_SIZES); + rsize->header.ver = 1; + rsize->header.compat = 1; + rsize->header.data_len = cpu_to_le32(sizeof(*rsize) - header_len); + rsize->total_ops = cpu_to_le64(m->total_reads); + rsize->total_size = cpu_to_le64(m->read_size_sum); + items++; + + /* encode the write io size metric */ + wsize = (struct 
ceph_write_io_size *)(rsize + 1); + wsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_IO_SIZES); + wsize->header.ver = 1; + wsize->header.compat = 1; + wsize->header.data_len = cpu_to_le32(sizeof(*wsize) - header_len); + wsize->total_ops = cpu_to_le64(m->total_writes); + wsize->total_size = cpu_to_le64(m->write_size_sum); + items++; + put_unaligned_le32(items, &head->num); msg->front.iov_len = len; msg->hdr.version = cpu_to_le16(1); @@ -225,6 +249,9 @@ int ceph_metric_init(struct ceph_client_metric *m) m->read_latency_max = 0; m->total_reads = 0; m->read_latency_sum = 0; + m->read_size_min = U64_MAX; + m->read_size_max = 0; + m->read_size_sum = 0; spin_lock_init(&m->write_metric_lock); m->write_latency_sq_sum = 0; @@ -232,6 +259,9 @@ int ceph_metric_init(struct ceph_client_metric *m) m->write_latency_max = 0; m->total_writes = 0; m->write_latency_sum = 0; + m->write_size_min = U64_MAX; + m->write_size_max = 0; + m->write_size_sum = 0; spin_lock_init(&m->metadata_metric_lock); m->metadata_latency_sq_sum = 0; @@ -281,23 +311,21 @@ void ceph_metric_destroy(struct ceph_client_metric *m) cancel_delayed_work_sync(&m->delayed_work); - if (m->session) - ceph_put_mds_session(m->session); + ceph_put_mds_session(m->session); } -static inline void __update_latency(ktime_t *totalp, ktime_t *lsump, - ktime_t *min, ktime_t *max, - ktime_t *sq_sump, ktime_t lat) -{ - ktime_t total, avg, sq, lsum; - - total = ++(*totalp); - lsum = (*lsump += lat); +#define METRIC_UPDATE_MIN_MAX(min, max, new) \ +{ \ + if (unlikely(new < min)) \ + min = new; \ + if (unlikely(new > max)) \ + max = new; \ +} - if (unlikely(lat < *min)) - *min = lat; - if (unlikely(lat > *max)) - *max = lat; +static inline void __update_stdev(ktime_t total, ktime_t lsum, + ktime_t *sq_sump, ktime_t lat) +{ + ktime_t avg, sq; if (unlikely(total == 1)) return; @@ -312,33 +340,51 @@ static inline void __update_latency(ktime_t *totalp, ktime_t *lsump, void ceph_update_read_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, - int rc) + unsigned int size, int rc) { ktime_t lat = ktime_sub(r_end, r_start); + ktime_t total; if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) return; spin_lock(&m->read_metric_lock); - __update_latency(&m->total_reads, &m->read_latency_sum, - &m->read_latency_min, &m->read_latency_max, - &m->read_latency_sq_sum, lat); + total = ++m->total_reads; + m->read_size_sum += size; + m->read_latency_sum += lat; + METRIC_UPDATE_MIN_MAX(m->read_size_min, + m->read_size_max, + size); + METRIC_UPDATE_MIN_MAX(m->read_latency_min, + m->read_latency_max, + lat); + __update_stdev(total, m->read_latency_sum, + &m->read_latency_sq_sum, lat); spin_unlock(&m->read_metric_lock); } void ceph_update_write_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, - int rc) + unsigned int size, int rc) { ktime_t lat = ktime_sub(r_end, r_start); + ktime_t total; if (unlikely(rc && rc != -ETIMEDOUT)) return; spin_lock(&m->write_metric_lock); - __update_latency(&m->total_writes, &m->write_latency_sum, - &m->write_latency_min, &m->write_latency_max, - &m->write_latency_sq_sum, lat); + total = ++m->total_writes; + m->write_size_sum += size; + m->write_latency_sum += lat; + METRIC_UPDATE_MIN_MAX(m->write_size_min, + m->write_size_max, + size); + METRIC_UPDATE_MIN_MAX(m->write_latency_min, + m->write_latency_max, + lat); + __update_stdev(total, m->write_latency_sum, + &m->write_latency_sq_sum, lat); spin_unlock(&m->write_metric_lock); } @@ -347,13 +393,18 @@ void ceph_update_metadata_metrics(struct 
ceph_client_metric *m, int rc) { ktime_t lat = ktime_sub(r_end, r_start); + ktime_t total; if (unlikely(rc && rc != -ENOENT)) return; spin_lock(&m->metadata_metric_lock); - __update_latency(&m->total_metadatas, &m->metadata_latency_sum, - &m->metadata_latency_min, &m->metadata_latency_max, - &m->metadata_latency_sq_sum, lat); + total = ++m->total_metadatas; + m->metadata_latency_sum += lat; + METRIC_UPDATE_MIN_MAX(m->metadata_latency_min, + m->metadata_latency_max, + lat); + __update_stdev(total, m->metadata_latency_sum, + &m->metadata_latency_sq_sum, lat); spin_unlock(&m->metadata_metric_lock); } diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index e984eb2bb14b..0133955a3c6a 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -17,8 +17,10 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_OPENED_FILES, CLIENT_METRIC_TYPE_PINNED_ICAPS, CLIENT_METRIC_TYPE_OPENED_INODES, + CLIENT_METRIC_TYPE_READ_IO_SIZES, + CLIENT_METRIC_TYPE_WRITE_IO_SIZES, - CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_OPENED_INODES, + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_WRITE_IO_SIZES, }; /* @@ -34,18 +36,22 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_OPENED_FILES, \ CLIENT_METRIC_TYPE_PINNED_ICAPS, \ CLIENT_METRIC_TYPE_OPENED_INODES, \ + CLIENT_METRIC_TYPE_READ_IO_SIZES, \ + CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \ \ CLIENT_METRIC_TYPE_MAX, \ } -/* metric caps header */ -struct ceph_metric_cap { +struct ceph_metric_header { __le32 type; /* ceph metric type */ - __u8 ver; __u8 compat; - __le32 data_len; /* length of sizeof(hit + mis + total) */ +} __packed; + +/* metric caps header */ +struct ceph_metric_cap { + struct ceph_metric_header header; __le64 hit; __le64 mis; __le64 total; @@ -53,48 +59,28 @@ struct ceph_metric_cap { /* metric read latency header */ struct ceph_metric_read_latency { - __le32 type; /* ceph metric type */ - - __u8 ver; - __u8 compat; - - __le32 data_len; /* length of sizeof(sec + nsec) */ + struct ceph_metric_header header; __le32 sec; __le32 nsec; } __packed; /* metric write latency header */ struct ceph_metric_write_latency { - __le32 type; /* ceph metric type */ - - __u8 ver; - __u8 compat; - - __le32 data_len; /* length of sizeof(sec + nsec) */ + struct ceph_metric_header header; __le32 sec; __le32 nsec; } __packed; /* metric metadata latency header */ struct ceph_metric_metadata_latency { - __le32 type; /* ceph metric type */ - - __u8 ver; - __u8 compat; - - __le32 data_len; /* length of sizeof(sec + nsec) */ + struct ceph_metric_header header; __le32 sec; __le32 nsec; } __packed; /* metric dentry lease header */ struct ceph_metric_dlease { - __le32 type; /* ceph metric type */ - - __u8 ver; - __u8 compat; - - __le32 data_len; /* length of sizeof(hit + mis + total) */ + struct ceph_metric_header header; __le64 hit; __le64 mis; __le64 total; @@ -102,40 +88,39 @@ struct ceph_metric_dlease { /* metric opened files header */ struct ceph_opened_files { - __le32 type; /* ceph metric type */ - - __u8 ver; - __u8 compat; - - __le32 data_len; /* length of sizeof(opened_files + total) */ + struct ceph_metric_header header; __le64 opened_files; __le64 total; } __packed; /* metric pinned i_caps header */ struct ceph_pinned_icaps { - __le32 type; /* ceph metric type */ - - __u8 ver; - __u8 compat; - - __le32 data_len; /* length of sizeof(pinned_icaps + total) */ + struct ceph_metric_header header; __le64 pinned_icaps; __le64 total; } __packed; /* metric opened inodes header */ struct ceph_opened_inodes { - __le32 type; /* ceph metric type */ - - __u8 ver; - __u8 compat; - - __le32 data_len; /* 
length of sizeof(opened_inodes + total) */ + struct ceph_metric_header header; __le64 opened_inodes; __le64 total; } __packed; +/* metric read io size header */ +struct ceph_read_io_size { + struct ceph_metric_header header; + __le64 total_ops; + __le64 total_size; +} __packed; + +/* metric write io size header */ +struct ceph_write_io_size { + struct ceph_metric_header header; + __le64 total_ops; + __le64 total_size; +} __packed; + struct ceph_metric_head { __le32 num; /* the number of metrics that will be sent */ } __packed; @@ -152,6 +137,9 @@ struct ceph_client_metric { spinlock_t read_metric_lock; u64 total_reads; + u64 read_size_sum; + u64 read_size_min; + u64 read_size_max; ktime_t read_latency_sum; ktime_t read_latency_sq_sum; ktime_t read_latency_min; @@ -159,6 +147,9 @@ struct ceph_client_metric { spinlock_t write_metric_lock; u64 total_writes; + u64 write_size_sum; + u64 write_size_min; + u64 write_size_max; ktime_t write_latency_sum; ktime_t write_latency_sq_sum; ktime_t write_latency_min; @@ -206,10 +197,10 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m) extern void ceph_update_read_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, - int rc); + unsigned int size, int rc); extern void ceph_update_write_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, - int rc); + unsigned int size, int rc); extern void ceph_update_metadata_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 4e32c9600ecc..620c691af40e 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -74,8 +74,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, le64_to_cpu(h->max_files)); spin_unlock(&ci->i_ceph_lock); - /* avoid calling iput_final() in dispatch thread */ - ceph_async_iput(inode); + iput(inode); } static struct ceph_quotarealm_inode * @@ -247,8 +246,7 @@ restart: ci = ceph_inode(in); has_quota = __ceph_has_any_quota(ci); - /* avoid calling iput_final() while holding mdsc->snap_rwsem */ - ceph_async_iput(in); + iput(in); next = realm->parent; if (has_quota || !next) @@ -383,8 +381,7 @@ restart: pr_warn("Invalid quota check op (%d)\n", op); exceeded = true; /* Just break the loop */ } - /* avoid calling iput_final() while holding mdsc->snap_rwsem */ - ceph_async_iput(in); + iput(in); next = realm->parent; if (exceeded || !next) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 4ce18055d931..4ac0606dcbd4 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -60,11 +60,13 @@ /* * increase ref count for the realm * - * caller must hold snap_rwsem for write. + * caller must hold snap_rwsem. */ void ceph_get_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { + lockdep_assert_held(&mdsc->snap_rwsem); + dout("get_realm %p %d -> %d\n", realm, atomic_read(&realm->nref), atomic_read(&realm->nref)+1); /* @@ -113,6 +115,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm( { struct ceph_snap_realm *realm; + lockdep_assert_held_write(&mdsc->snap_rwsem); + realm = kzalloc(sizeof(*realm), GFP_NOFS); if (!realm) return ERR_PTR(-ENOMEM); @@ -135,7 +139,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm( /* * lookup the realm rooted at @ino. * - * caller must hold snap_rwsem for write. + * caller must hold snap_rwsem. 
*/ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino) @@ -143,6 +147,8 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, struct rb_node *n = mdsc->snap_realms.rb_node; struct ceph_snap_realm *r; + lockdep_assert_held(&mdsc->snap_rwsem); + while (n) { r = rb_entry(n, struct ceph_snap_realm, node); if (ino < r->ino) @@ -176,6 +182,8 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc, static void __destroy_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { + lockdep_assert_held_write(&mdsc->snap_rwsem); + dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); rb_erase(&realm->node, &mdsc->snap_realms); @@ -198,6 +206,8 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc, static void __put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { + lockdep_assert_held_write(&mdsc->snap_rwsem); + dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, atomic_read(&realm->nref), atomic_read(&realm->nref)-1); if (atomic_dec_and_test(&realm->nref)) @@ -236,6 +246,8 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) { struct ceph_snap_realm *realm; + lockdep_assert_held_write(&mdsc->snap_rwsem); + spin_lock(&mdsc->snap_empty_lock); while (!list_empty(&mdsc->snap_empty)) { realm = list_first_entry(&mdsc->snap_empty, @@ -269,6 +281,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, { struct ceph_snap_realm *parent; + lockdep_assert_held_write(&mdsc->snap_rwsem); + if (realm->parent_ino == parentino) return 0; @@ -460,7 +474,7 @@ static bool has_new_snaps(struct ceph_snap_context *o, * Caller must hold snap_rwsem for read (i.e., the realm topology won't * change). */ -void ceph_queue_cap_snap(struct ceph_inode_info *ci) +static void ceph_queue_cap_snap(struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; @@ -663,15 +677,13 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) if (!inode) continue; spin_unlock(&realm->inodes_with_caps_lock); - /* avoid calling iput_final() while holding - * mdsc->snap_rwsem or in mds dispatch threads */ - ceph_async_iput(lastinode); + iput(lastinode); lastinode = inode; ceph_queue_cap_snap(ci); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); - ceph_async_iput(lastinode); + iput(lastinode); dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); } @@ -696,6 +708,8 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, int err = -ENOMEM; LIST_HEAD(dirty_realms); + lockdep_assert_held_write(&mdsc->snap_rwsem); + dout("update_snap_trace deletion=%d\n", deletion); more: ceph_decode_need(&p, e, sizeof(*ri), bad); @@ -791,7 +805,7 @@ more: return 0; bad: - err = -EINVAL; + err = -EIO; fail: if (realm && !IS_ERR(realm)) ceph_put_snap_realm(mdsc, realm); @@ -823,17 +837,12 @@ static void flush_snaps(struct ceph_mds_client *mdsc) ihold(inode); spin_unlock(&mdsc->snap_flush_lock); ceph_flush_snaps(ci, &session); - /* avoid calling iput_final() while holding - * session->s_mutex or in mds dispatch threads */ - ceph_async_iput(inode); + iput(inode); spin_lock(&mdsc->snap_flush_lock); } spin_unlock(&mdsc->snap_flush_lock); - if (session) { - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - } + ceph_put_mds_session(session); dout("flush_snaps done\n"); } @@ -969,14 +978,12 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, ceph_get_snap_realm(mdsc, realm); 
ceph_put_snap_realm(mdsc, oldrealm); - /* avoid calling iput_final() while holding - * mdsc->snap_rwsem or mds in dispatch threads */ - ceph_async_iput(inode); + iput(inode); continue; skip_inode: spin_unlock(&ci->i_ceph_lock); - ceph_async_iput(inode); + iput(inode); } /* we may have taken some of the old realm's children. */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 839e6b0239ee..6b6332a5c113 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -931,7 +931,6 @@ extern int ceph_update_snap_trace(struct ceph_mds_client *m, extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); -extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); @@ -989,8 +988,6 @@ extern int ceph_inode_holds_cap(struct inode *inode, int mask); extern bool ceph_inode_set_size(struct inode *inode, loff_t size); extern void __ceph_do_pending_vmtruncate(struct inode *inode); -extern void ceph_async_iput(struct inode *inode); - void ceph_queue_inode_work(struct inode *inode, int work_bit); static inline void ceph_queue_vmtruncate(struct inode *inode) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index ba7c01cd9a5d..df00231d3ecc 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -582,22 +582,12 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will - * be returned. */ -struct dentry *debugfs_create_ulong(const char *name, umode_t mode, - struct dentry *parent, unsigned long *value) +void debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent, + unsigned long *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, - &fops_ulong, &fops_ulong_ro, - &fops_ulong_wo); + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_ulong, + &fops_ulong_ro, &fops_ulong_wo); } EXPORT_SYMBOL_GPL(debugfs_create_ulong); @@ -846,20 +836,11 @@ static const struct file_operations fops_bool_wo = { * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will - * be returned. 
*/ -struct dentry *debugfs_create_bool(const char *name, umode_t mode, - struct dentry *parent, bool *value) +void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, + bool *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool, &fops_bool_ro, &fops_bool_wo); } EXPORT_SYMBOL_GPL(debugfs_create_bool); @@ -980,7 +961,8 @@ static const struct file_operations fops_blob = { /** * debugfs_create_blob - create a debugfs file that is used to read a binary blob * @name: a pointer to a string containing the name of the file to create. - * @mode: the permission that the file should have + * @mode: the read permission that the file should have (other permissions are + * masked out) * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. @@ -1004,7 +986,7 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { - return debugfs_create_file_unsafe(name, mode, parent, blob, &fops_blob); + return debugfs_create_file_unsafe(name, mode & 0444, parent, blob, &fops_blob); } EXPORT_SYMBOL_GPL(debugfs_create_blob); diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index c4523648472a..cb1c0d8c1714 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -63,7 +63,7 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry) { int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext; - unsigned int type, clu_offset; + unsigned int type, clu_offset, max_dentries; sector_t sector; struct exfat_chain dir, clu; struct exfat_uni_name uni_name; @@ -86,6 +86,8 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent dentries_per_clu = sbi->dentries_per_clu; dentries_per_clu_bits = ilog2(dentries_per_clu); + max_dentries = (unsigned int)min_t(u64, MAX_EXFAT_DENTRIES, + (u64)sbi->num_clusters << dentries_per_clu_bits); clu_offset = dentry >> dentries_per_clu_bits; exfat_chain_dup(&clu, &dir); @@ -109,7 +111,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent } } - while (clu.dir != EXFAT_EOF_CLUSTER) { + while (clu.dir != EXFAT_EOF_CLUSTER && dentry < max_dentries) { i = dentry & (dentries_per_clu - 1); for ( ; i < dentries_per_clu; i++, dentry++) { @@ -245,7 +247,7 @@ static int exfat_iterate(struct file *filp, struct dir_context *ctx) if (err) goto unlock; get_new: - if (cpos >= i_size_read(inode)) + if (ei->flags == ALLOC_NO_FAT_CHAIN && cpos >= i_size_read(inode)) goto end_of_dir; err = exfat_readdir(inode, &cpos, &de); diff --git a/fs/exfat/super.c b/fs/exfat/super.c index d38d17a77e76..5539ffc20d16 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -690,7 +690,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) if (!sb->s_root) { exfat_err(sb, "failed to get the root dentry"); err = -ENOMEM; - goto put_inode; + goto free_table; } return 0; diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index be799040a415..b96ecba91899 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -327,6 +327,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, set_buffer_meta(bh); set_buffer_prio(bh); + set_buffer_uptodate(bh); if (ext4_handle_valid(handle)) { err = 
jbd2_journal_dirty_metadata(handle, bh); /* Errors can only happen due to aborted journal or a nasty bug */ @@ -355,7 +356,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, err); } } else { - set_buffer_uptodate(bh); if (inode) mark_buffer_dirty_inode(bh, inode); else diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e27f34bceb8d..6eed6170aded 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -692,6 +692,13 @@ static long ext4_ioctl_group_add(struct file *file, if (err) return err; + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_add_out; + } + err = mnt_want_write_file(file); if (err) goto group_add_out; @@ -816,7 +823,7 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) if (!EXT4_SB(sb)->s_journal) return -ENODEV; - if (flags & ~JBD2_JOURNAL_FLUSH_VALID) + if (flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) return -EINVAL; q = bdev_get_queue(EXT4_SB(sb)->s_journal->j_dev); @@ -914,6 +921,13 @@ setversion_out: goto group_extend_out; } + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_extend_out; + } + err = mnt_want_write_file(filp); if (err) goto group_extend_out; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c2c22c2baac0..089c958aa2c3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1909,10 +1909,11 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { /* Should never happen! (but apparently sometimes does?!?) */ WARN_ON(1); - ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent " - "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", - block, order, needed, ex->fe_group, ex->fe_start, - ex->fe_len, ex->fe_logical); + ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, + "corruption or bug in mb_find_extent " + "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", + block, order, needed, ex->fe_group, ex->fe_start, + ex->fe_len, ex->fe_logical); ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 6cb598b549ca..bc364c119af6 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -156,7 +156,12 @@ static int kmmpd(void *data) memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); - while (!kthread_should_stop()) { + while (!kthread_should_stop() && !sb_rdonly(sb)) { + if (!ext4_has_feature_mmp(sb)) { + ext4_warning(sb, "kmmpd being stopped since MMP feature" + " has been disabled."); + goto wait_to_exit; + } if (++seq > EXT4_MMP_SEQ_MAX) seq = 1; @@ -177,16 +182,6 @@ static int kmmpd(void *data) failed_writes++; } - if (!(le32_to_cpu(es->s_feature_incompat) & - EXT4_FEATURE_INCOMPAT_MMP)) { - ext4_warning(sb, "kmmpd being stopped since MMP feature" - " has been disabled."); - goto exit_thread; - } - - if (sb_rdonly(sb)) - break; - diff = jiffies - last_update_time; if (diff < mmp_update_interval * HZ) schedule_timeout_interruptible(mmp_update_interval * @@ -207,7 +202,7 @@ static int kmmpd(void *data) ext4_error_err(sb, -retval, "error reading MMP data: %d", retval); - goto exit_thread; + goto wait_to_exit; } mmp_check = (struct mmp_struct *)(bh_check->b_data); @@ -221,7 +216,7 @@ static int kmmpd(void *data) ext4_error_err(sb, EBUSY, "abort"); put_bh(bh_check); retval = -EBUSY; - goto exit_thread; + goto wait_to_exit; } put_bh(bh_check); } @@ -244,7 +239,13 @@ static int kmmpd(void 
*data) retval = write_mmp_block(sb, bh); -exit_thread: +wait_to_exit: + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (!kthread_should_stop()) + schedule(); + } + set_current_state(TASK_RUNNING); return retval; } @@ -391,5 +392,3 @@ failed: brelse(bh); return 1; } - - diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index fc885914c88a..7a9f1adef679 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -74,10 +74,6 @@ int ext4_resize_begin(struct super_block *sb) return -EPERM; } - if (ext4_has_feature_bigalloc(sb)) { - ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); - return -EOPNOTSUPP; - } if (ext4_has_feature_sparse_super2(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2"); return -EOPNOTSUPP; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 20344633bdd9..dfa09a277b56 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -705,15 +705,23 @@ static void flush_stashed_error_work(struct work_struct *work) * ext4 error handling code during handling of previous errors. */ if (!sb_rdonly(sbi->s_sb) && journal) { + struct buffer_head *sbh = sbi->s_sbh; handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; - if (jbd2_journal_get_write_access(handle, sbi->s_sbh)) { + if (jbd2_journal_get_write_access(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } ext4_update_super(sbi->s_sb); - if (jbd2_journal_dirty_metadata(handle, sbi->s_sbh)) { + if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { + ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + + if (jbd2_journal_dirty_metadata(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } @@ -1176,7 +1184,6 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_sysfs(sb); if (sbi->s_journal) { - jbd2_journal_unregister_shrinker(sbi->s_journal); aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -5168,7 +5175,6 @@ failed_mount_wq: sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { - jbd2_journal_unregister_shrinker(sbi->s_journal); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } @@ -5494,12 +5500,6 @@ static int ext4_load_journal(struct super_block *sb, ext4_commit_super(sb); } - err = jbd2_journal_register_shrinker(journal); - if (err) { - EXT4_SB(sb)->s_journal = NULL; - goto err_out; - } - return 0; err_out: @@ -5985,7 +5985,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) */ ext4_mark_recovery_complete(sb, es); } - ext4_stop_mmpd(sbi); } else { /* Make sure we can mount this feature set readwrite */ if (ext4_has_feature_readonly(sb) || @@ -6099,6 +6098,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); + if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) + ext4_stop_mmpd(sbi); + /* * Some options can be enabled by ext4 and/or by VFS mount flag * either way we need to make sure it matches in both *flags and @@ -6132,6 +6134,8 @@ restore_opts: for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(to_free[i]); #endif + if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) + ext4_stop_mmpd(sbi); kfree(orig_data); return err; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f795049e63d5..6c208108d69c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -444,7 +444,7 @@ 
static int f2fs_set_meta_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); - f2fs_set_page_private(page, 0); + set_page_private_reference(page); return 1; } return 0; @@ -1018,7 +1018,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - f2fs_set_page_private(page, 0); + set_page_private_reference(page); } void f2fs_remove_dirty_inode(struct inode *inode) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 925a5ca3744a..455561826c7d 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -12,9 +12,11 @@ #include <linux/lzo.h> #include <linux/lz4.h> #include <linux/zstd.h> +#include <linux/pagevec.h> #include "f2fs.h" #include "node.h" +#include "segment.h" #include <trace/events/f2fs.h> static struct kmem_cache *cic_entry_slab; @@ -74,7 +76,7 @@ bool f2fs_is_compressed_page(struct page *page) return false; if (!page_private(page)) return false; - if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page)) + if (page_private_nonpointer(page)) return false; f2fs_bug_on(F2FS_M_SB(page->mapping), @@ -85,8 +87,7 @@ bool f2fs_is_compressed_page(struct page *page) static void f2fs_set_compressed_page(struct page *page, struct inode *inode, pgoff_t index, void *data) { - SetPagePrivate(page); - set_page_private(page, (unsigned long)data); + attach_page_private(page, (void *)data); /* i_crypto_info and iv index */ page->index = index; @@ -589,8 +590,7 @@ static void f2fs_compress_free_page(struct page *page) { if (!page) return; - set_page_private(page, (unsigned long)NULL); - ClearPagePrivate(page); + detach_page_private(page); page->mapping = NULL; unlock_page(page); mempool_free(page, compress_page_pool); @@ -738,7 +738,7 @@ out: return ret; } -static void f2fs_decompress_cluster(struct decompress_io_ctx *dic) +void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); struct f2fs_inode_info *fi = F2FS_I(dic->inode); @@ -837,7 +837,8 @@ out_end_io: * page being waited on in the cluster, and if so, it decompresses the cluster * (or in the case of a failure, cleans up without actually decompressing). 
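The comment above describes a last-one-out contract: each compressed page I/O drops a reference, and whichever completion drops the final one runs the decompression. A standalone C11 model of that countdown, with hypothetical demo_* names and the cache and block-address details stripped out:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_dic {
	atomic_int remaining_pages;	/* one count per in-flight compressed page */
	atomic_bool failed;		/* sticky: any failure poisons the cluster */
};

static void demo_decompress_cluster(struct demo_dic *dic)
{
	/* reached exactly once, after the last page completes */
	printf("decompress, failed=%d\n", atomic_load(&dic->failed));
}

static void demo_end_read_page(struct demo_dic *dic, bool failed)
{
	if (failed)
		atomic_store(&dic->failed, true);
	/* fetch_sub returns the old value, so 1 means this was the last page */
	if (atomic_fetch_sub(&dic->remaining_pages, 1) == 1)
		demo_decompress_cluster(dic);
}

int main(void)
{
	struct demo_dic dic = { .remaining_pages = 4, .failed = false };
	for (int i = 0; i < 4; i++)
		demo_end_read_page(&dic, false);	/* 4th call decompresses */
	return 0;
}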
*/ -void f2fs_end_read_compressed_page(struct page *page, bool failed) +void f2fs_end_read_compressed_page(struct page *page, bool failed, + block_t blkaddr) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); @@ -847,6 +848,9 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed) if (failed) WRITE_ONCE(dic->failed, true); + else if (blkaddr) + f2fs_cache_compressed_page(sbi, page, + dic->inode->i_ino, blkaddr); if (atomic_dec_and_test(&dic->remaining_pages)) f2fs_decompress_cluster(dic); @@ -876,7 +880,7 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) return is_page_in_cluster(cc, index); } -static bool __cluster_may_compress(struct compress_ctx *cc) +static bool cluster_has_invalid_data(struct compress_ctx *cc) { loff_t i_size = i_size_read(cc->inode); unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); @@ -889,19 +893,22 @@ static bool __cluster_may_compress(struct compress_ctx *cc) /* beyond EOF */ if (page->index >= nr_pages) - return false; + return true; } - return true; + return false; } -static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) +static int __f2fs_cluster_blocks(struct inode *inode, + unsigned int cluster_idx, bool compr) { struct dnode_of_data dn; + unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + unsigned int start_idx = cluster_idx << + F2FS_I(inode)->i_log_cluster_size; int ret; - set_new_dnode(&dn, cc->inode, NULL, NULL, 0); - ret = f2fs_get_dnode_of_data(&dn, start_idx_of_cluster(cc), - LOOKUP_NODE); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) { if (ret == -ENOENT) ret = 0; @@ -912,7 +919,7 @@ static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) int i; ret = 1; - for (i = 1; i < cc->cluster_size; i++) { + for (i = 1; i < cluster_size; i++) { block_t blkaddr; blkaddr = data_blkaddr(dn.inode, @@ -925,6 +932,10 @@ static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) ret++; } } + + f2fs_bug_on(F2FS_I_SB(inode), + !compr && ret != cluster_size && + !is_inode_flag_set(inode, FI_COMPRESS_RELEASED)); } fail: f2fs_put_dnode(&dn); @@ -934,25 +945,15 @@ fail: /* return # of compressed blocks in compressed cluster */ static int f2fs_compressed_blocks(struct compress_ctx *cc) { - return __f2fs_cluster_blocks(cc, true); + return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, true); } /* return # of valid blocks in compressed cluster */ -static int f2fs_cluster_blocks(struct compress_ctx *cc) -{ - return __f2fs_cluster_blocks(cc, false); -} - int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) { - struct compress_ctx cc = { - .inode = inode, - .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, - .cluster_size = F2FS_I(inode)->i_cluster_size, - .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, - }; - - return f2fs_cluster_blocks(&cc); + return __f2fs_cluster_blocks(inode, + index >> F2FS_I(inode)->i_log_cluster_size, + false); } static bool cluster_may_compress(struct compress_ctx *cc) @@ -961,13 +962,11 @@ static bool cluster_may_compress(struct compress_ctx *cc) return false; if (f2fs_is_atomic_file(cc->inode)) return false; - if (f2fs_is_mmap_file(cc->inode)) - return false; if (!f2fs_cluster_is_full(cc)) return false; if (unlikely(f2fs_cp_error(F2FS_I_SB(cc->inode)))) return false; - return __cluster_may_compress(cc); + return !cluster_has_invalid_data(cc); } static void set_cluster_writeback(struct compress_ctx *cc) @@ -995,21 +994,16 @@ 
static int prepare_compress_overwrite(struct compress_ctx *cc, struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct address_space *mapping = cc->inode->i_mapping; struct page *page; - struct dnode_of_data dn; sector_t last_block_in_bio; unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; pgoff_t start_idx = start_idx_of_cluster(cc); int i, ret; - bool prealloc; retry: - ret = f2fs_cluster_blocks(cc); + ret = f2fs_is_compressed_cluster(cc->inode, start_idx); if (ret <= 0) return ret; - /* compressed case */ - prealloc = (ret < cc->cluster_size); - ret = f2fs_init_compress_ctx(cc); if (ret) return ret; @@ -1067,25 +1061,6 @@ release_and_retry: } } - if (prealloc) { - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); - - set_new_dnode(&dn, cc->inode, NULL, NULL, 0); - - for (i = cc->cluster_size - 1; i > 0; i--) { - ret = f2fs_get_block(&dn, start_idx + i); - if (ret) { - i = cc->cluster_size; - break; - } - - if (dn.data_blkaddr != NEW_ADDR) - break; - } - - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); - } - if (likely(!ret)) { *fsdata = cc->rpages; *pagep = cc->rpages[offset_in_cluster(cc, index)]; @@ -1216,6 +1191,12 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, loff_t psize; int i, err; + /* we should bypass data pages to proceed the kworker jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(cc->rpages[0]->mapping, -EIO); + goto out_free; + } + if (IS_NOQUOTA(inode)) { /* * We need to wait for node_write to avoid block allocation during @@ -1399,7 +1380,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) for (i = 0; i < cic->nr_rpages; i++) { WARN_ON(!cic->rpages[i]); - clear_cold_data(cic->rpages[i]); + clear_page_private_gcing(cic->rpages[i]); end_page_writeback(cic->rpages[i]); } @@ -1685,6 +1666,164 @@ void f2fs_put_page_dic(struct page *page) f2fs_put_dic(dic); } +const struct address_space_operations f2fs_compress_aops = { + .releasepage = f2fs_release_page, + .invalidatepage = f2fs_invalidate_page, +}; + +struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->compress_inode->i_mapping; +} + +void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + if (!sbi->compress_inode) + return; + invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr); +} + +void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + nid_t ino, block_t blkaddr) +{ + struct page *cpage; + int ret; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) + return; + + if (!f2fs_available_free_memory(sbi, COMPRESS_PAGE)) + return; + + cpage = find_get_page(COMPRESS_MAPPING(sbi), blkaddr); + if (cpage) { + f2fs_put_page(cpage, 0); + return; + } + + cpage = alloc_page(__GFP_NOWARN | __GFP_IO); + if (!cpage) + return; + + ret = add_to_page_cache_lru(cpage, COMPRESS_MAPPING(sbi), + blkaddr, GFP_NOFS); + if (ret) { + f2fs_put_page(cpage, 0); + return; + } + + set_page_private_data(cpage, ino); + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) + goto out; + + memcpy(page_address(cpage), page_address(page), PAGE_SIZE); + SetPageUptodate(cpage); +out: + f2fs_put_page(cpage, 1); +} + +bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + block_t blkaddr) +{ + struct page *cpage; + bool hitted = false; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return false; + + cpage = f2fs_pagecache_get_page(COMPRESS_MAPPING(sbi), + blkaddr, FGP_LOCK | FGP_NOWAIT,
GFP_NOFS); + if (cpage) { + if (PageUptodate(cpage)) { + atomic_inc(&sbi->compress_page_hit); + memcpy(page_address(page), + page_address(cpage), PAGE_SIZE); + hitted = true; + } + f2fs_put_page(cpage, 1); + } + + return hitted; +} + +void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct address_space *mapping = sbi->compress_inode->i_mapping; + struct pagevec pvec; + pgoff_t index = 0; + pgoff_t end = MAX_BLKADDR(sbi); + + if (!mapping->nrpages) + return; + + pagevec_init(&pvec); + + do { + unsigned int nr_pages; + int i; + + nr_pages = pagevec_lookup_range(&pvec, mapping, + &index, end - 1); + if (!nr_pages) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page->index > end) + break; + + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + continue; + } + + if (ino != get_page_private_data(page)) { + unlock_page(page); + continue; + } + + generic_error_remove_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } while (index < end); +} + +int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) +{ + struct inode *inode; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return 0; + + inode = f2fs_iget(sbi->sb, F2FS_COMPRESS_INO(sbi)); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->compress_inode = inode; + + sbi->compress_percent = COMPRESS_PERCENT; + sbi->compress_watermark = COMPRESS_WATERMARK; + + atomic_set(&sbi->compress_page_hit, 0); + + return 0; +} + +void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) +{ + if (!sbi->compress_inode) + return; + iput(sbi->compress_inode); + sbi->compress_inode = NULL; +} + int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 009a09fb9d88..d2cf48c5a2e4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -58,18 +58,19 @@ static bool __is_cp_guaranteed(struct page *page) if (!mapping) return false; - if (f2fs_is_compressed_page(page)) - return false; - inode = mapping->host; sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_META_INO(sbi) || inode->i_ino == F2FS_NODE_INO(sbi) || - S_ISDIR(inode->i_mode) || - (S_ISREG(inode->i_mode) && + S_ISDIR(inode->i_mode)) + return true; + + if (f2fs_is_compressed_page(page)) + return false; + if ((S_ISREG(inode->i_mode) && (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || - is_cold_data(page)) + page_private_gcing(page)) return true; return false; } @@ -131,7 +132,7 @@ static void f2fs_finish_read_bio(struct bio *bio) if (f2fs_is_compressed_page(page)) { if (bio->bi_status) - f2fs_end_read_compressed_page(page, true); + f2fs_end_read_compressed_page(page, true, 0); f2fs_put_page_dic(page); continue; } @@ -227,15 +228,19 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) struct bio_vec *bv; struct bvec_iter_all iter_all; bool all_compressed = true; + block_t blkaddr = SECTOR_TO_BLOCK(ctx->bio->bi_iter.bi_sector); bio_for_each_segment_all(bv, ctx->bio, iter_all) { struct page *page = bv->bv_page; /* PG_error was set if decryption failed. 
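A note on the bio walk above: f2fs_handle_step_decompress() now derives each page's block address from the bio's starting sector and increments it as it walks the segments. Assuming the usual 512-byte sectors and 4 KiB f2fs blocks (so SECTOR_TO_BLOCK() is a shift by 3), a quick model with made-up numbers:

#include <stdio.h>

#define SECTORS_PER_BLOCK_SHIFT 3	/* 4096 / 512 = 8 sectors per block */
#define SECTOR_TO_BLOCK(s) ((s) >> SECTORS_PER_BLOCK_SHIFT)

int main(void)
{
	unsigned long long bi_sector = 81920;	/* bio start, in 512 B sectors */
	unsigned int blkaddr = SECTOR_TO_BLOCK(bi_sector);	/* 10240 */

	for (int i = 0; i < 3; i++, blkaddr++)	/* one block per bio page */
		printf("page %d -> blkaddr %u\n", i, blkaddr);
	return 0;
}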
*/ if (f2fs_is_compressed_page(page)) - f2fs_end_read_compressed_page(page, PageError(page)); + f2fs_end_read_compressed_page(page, PageError(page), + blkaddr); else all_compressed = false; + + blkaddr++; } /* @@ -299,9 +304,8 @@ static void f2fs_write_end_io(struct bio *bio) struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); - if (IS_DUMMY_WRITTEN_PAGE(page)) { - set_page_private(page, (unsigned long)NULL); - ClearPagePrivate(page); + if (page_private_dummy(page)) { + clear_page_private_dummy(page); unlock_page(page); mempool_free(page, sbi->write_io_dummy); @@ -331,7 +335,7 @@ static void f2fs_write_end_io(struct bio *bio) dec_page_count(sbi, type); if (f2fs_in_warm_node_list(sbi, page)) f2fs_del_fsync_node_entry(sbi, page); - clear_cold_data(page); + clear_page_private_gcing(page); end_page_writeback(page); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && @@ -455,10 +459,11 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, GFP_NOIO | __GFP_NOFAIL); f2fs_bug_on(sbi, !page); - zero_user_segment(page, 0, PAGE_SIZE); - SetPagePrivate(page); - set_page_private(page, DUMMY_WRITTEN_PAGE); lock_page(page); + + zero_user_segment(page, 0, PAGE_SIZE); + set_page_private_dummy(page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) f2fs_bug_on(sbi, 1); } @@ -1351,9 +1356,11 @@ alloc: old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); + f2fs_invalidate_compress_page(sbi, old_blkaddr); + } f2fs_update_data_blkaddr(dn, dn->data_blkaddr); /* @@ -2173,7 +2180,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } - for (i = 0; i < dic->nr_cpages; i++) { + for (i = 0; i < cc->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; struct bio_post_read_ctx *ctx; @@ -2181,6 +2188,14 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); + f2fs_wait_on_block_writeback(inode, blkaddr); + + if (f2fs_load_compressed_page(sbi, page, blkaddr)) { + if (atomic_dec_and_test(&dic->remaining_pages)) + f2fs_decompress_cluster(dic); + continue; + } + if (bio && (!page_is_mergeable(sbi, bio, *last_block_in_bio, blkaddr) || !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { @@ -2202,8 +2217,6 @@ submit_and_realloc: } } - f2fs_wait_on_block_writeback(inode, blkaddr); - if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; @@ -2459,6 +2472,10 @@ static inline bool check_inplace_update_policy(struct inode *inode, bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) { + /* swap file is migrating in aligned write mode */ + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return false; + if (f2fs_is_pinned_file(inode)) return true; @@ -2481,10 +2498,15 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (f2fs_is_atomic_file(inode)) return true; + + /* swap file is migrating in aligned write mode */ + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return true; + if (fio) { - if (is_cold_data(fio->page)) + if (page_private_gcing(fio->page)) return true; - if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + if (page_private_dummy(fio->page)) return true; if (unlikely(is_sbi_flag_set(sbi, 
SBI_CP_DISABLED) && f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) @@ -2540,7 +2562,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) /* This page is already truncated */ if (fio->old_blkaddr == NULL_ADDR) { ClearPageUptodate(page); - clear_cold_data(page); + clear_page_private_gcing(page); goto out_writepage; } got_it: @@ -2750,7 +2772,7 @@ out: inode_dec_dirty_pages(inode); if (err) { ClearPageUptodate(page); - clear_cold_data(page); + clear_page_private_gcing(page); } if (wbc->for_reclaim) { @@ -3224,7 +3246,7 @@ restart: f2fs_do_read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) - set_inline_node(ipage); + set_page_private_inline(ipage); } else { err = f2fs_convert_inline_page(&dn, page); if (err) @@ -3615,12 +3637,20 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, } } - clear_cold_data(page); + clear_page_private_gcing(page); + + if (test_opt(sbi, COMPRESS_CACHE)) { + if (f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(sbi, inode->i_ino); + if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(page); + } - if (IS_ATOMIC_WRITTEN_PAGE(page)) + if (page_private_atomic(page)) return f2fs_drop_inmem_page(inode, page); - f2fs_clear_page_private(page); + detach_page_private(page); + set_page_private(page, 0); } int f2fs_release_page(struct page *page, gfp_t wait) @@ -3630,11 +3660,23 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 0; /* This is atomic written page, keep Private */ - if (IS_ATOMIC_WRITTEN_PAGE(page)) + if (page_private_atomic(page)) return 0; - clear_cold_data(page); - f2fs_clear_page_private(page); + if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) { + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct inode *inode = page->mapping->host; + + if (f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(sbi, inode->i_ino); + if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(page); + } + + clear_page_private_gcing(page); + + detach_page_private(page); + set_page_private(page, 0); return 1; } @@ -3650,7 +3692,7 @@ static int f2fs_set_data_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { - if (!IS_ATOMIC_WRITTEN_PAGE(page)) { + if (!page_private_atomic(page)) { f2fs_register_inmem_page(inode, page); return 1; } @@ -3742,7 +3784,7 @@ int f2fs_migrate_page(struct address_space *mapping, { int rc, extra_count; struct f2fs_inode_info *fi = F2FS_I(mapping->host); - bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page); + bool atomic_written = page_private_atomic(page); BUG_ON(PageWriteback(page)); @@ -3777,9 +3819,16 @@ int f2fs_migrate_page(struct address_space *mapping, get_page(newpage); } + /* guarantee to start from no stale private field */ + set_page_private(newpage, 0); if (PagePrivate(page)) { - f2fs_set_page_private(newpage, page_private(page)); - f2fs_clear_page_private(page); + set_page_private(newpage, page_private(page)); + SetPagePrivate(newpage); + get_page(newpage); + + set_page_private(page, 0); + ClearPagePrivate(page); + put_page(page); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -3792,67 +3841,66 @@ int f2fs_migrate_page(struct address_space *mapping, #endif #ifdef CONFIG_SWAP -static int f2fs_is_file_aligned(struct inode *inode) +static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, + unsigned int blkcnt) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - block_t main_blkaddr = SM_I(sbi)->main_blkaddr; - block_t cur_lblock; 
- block_t last_lblock; - block_t pblock; - unsigned long nr_pblocks; - unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); - unsigned int not_aligned = 0; + unsigned int blkofs; + unsigned int blk_per_sec = BLKS_PER_SEC(sbi); + unsigned int secidx = start_blk / blk_per_sec; + unsigned int end_sec = secidx + blkcnt / blk_per_sec; int ret = 0; - cur_lblock = 0; - last_lblock = bytes_to_blks(inode, i_size_read(inode)); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); - while (cur_lblock < last_lblock) { - struct f2fs_map_blocks map; + set_inode_flag(inode, FI_ALIGNED_WRITE); - memset(&map, 0, sizeof(map)); - map.m_lblk = cur_lblock; - map.m_len = last_lblock - cur_lblock; - map.m_next_pgofs = NULL; - map.m_next_extent = NULL; - map.m_seg_type = NO_CHECK_TYPE; - map.m_may_create = false; + for (; secidx < end_sec; secidx++) { + down_write(&sbi->pin_sem); - ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); - if (ret) - goto out; + f2fs_lock_op(sbi); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); + f2fs_unlock_op(sbi); - /* hole */ - if (!(map.m_flags & F2FS_MAP_FLAGS)) { - f2fs_err(sbi, "Swapfile has holes\n"); - ret = -ENOENT; - goto out; - } + set_inode_flag(inode, FI_DO_DEFRAG); - pblock = map.m_pblk; - nr_pblocks = map.m_len; + for (blkofs = 0; blkofs < blk_per_sec; blkofs++) { + struct page *page; + unsigned int blkidx = secidx * blk_per_sec + blkofs; - if ((pblock - main_blkaddr) & (blocks_per_sec - 1) || - nr_pblocks & (blocks_per_sec - 1)) { - if (f2fs_is_pinned_file(inode)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; + page = f2fs_get_lock_data_page(inode, blkidx, true); + if (IS_ERR(page)) { + up_write(&sbi->pin_sem); + ret = PTR_ERR(page); + goto done; } - not_aligned++; + + set_page_dirty(page); + f2fs_put_page(page, 1); } - cur_lblock += nr_pblocks; + clear_inode_flag(inode, FI_DO_DEFRAG); + + ret = filemap_fdatawrite(inode->i_mapping); + + up_write(&sbi->pin_sem); + + if (ret) + break; } - if (not_aligned) - f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" - "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", - not_aligned); -out: + +done: + clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_ALIGNED_WRITE); + + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + return ret; } -static int check_swap_activate_fast(struct swap_info_struct *sis, +static int check_swap_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; @@ -3865,7 +3913,8 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sector_t highest_pblock = 0; int nr_extents = 0; unsigned long nr_pblocks; - unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + unsigned int blks_per_sec = BLKS_PER_SEC(sbi); + unsigned int sec_blks_mask = BLKS_PER_SEC(sbi) - 1; unsigned int not_aligned = 0; int ret = 0; @@ -3878,7 +3927,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; - +retry: cond_resched(); memset(&map, 0, sizeof(map)); @@ -3895,7 +3944,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, /* hole */ if (!(map.m_flags & F2FS_MAP_FLAGS)) { - f2fs_err(sbi, "Swapfile has holes\n"); + f2fs_err(sbi, "Swapfile has holes"); ret = -EINVAL; goto out; } @@ -3903,16 +3952,28 @@ static int check_swap_activate_fast(struct 
swap_info_struct *sis, pblock = map.m_pblk; nr_pblocks = map.m_len; - if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) || - nr_pblocks & (blocks_per_sec - 1)) { - if (f2fs_is_pinned_file(inode)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; - } + if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask || + nr_pblocks & sec_blks_mask) { not_aligned++; - } + nr_pblocks = roundup(nr_pblocks, blks_per_sec); + if (cur_lblock + nr_pblocks > sis->max) + nr_pblocks -= blks_per_sec; + + if (!nr_pblocks) { + /* this extent is last one */ + nr_pblocks = map.m_len; + f2fs_warn(sbi, "Swapfile: last extent is not aligned to section"); + goto next; + } + + ret = f2fs_migrate_blocks(inode, cur_lblock, + nr_pblocks); + if (ret) + goto out; + goto retry; + } +next: if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; @@ -3939,120 +4000,11 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sis->max = cur_lblock; sis->pages = cur_lblock - 1; sis->highest_bit = cur_lblock - 1; - - if (not_aligned) - f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" - "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", - not_aligned); -out: - return ret; -} - -/* Copied from generic_swapfile_activate() to check any holes */ -static int check_swap_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) -{ - struct address_space *mapping = swap_file->f_mapping; - struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - unsigned blocks_per_page; - unsigned long page_no; - sector_t probe_block; - sector_t last_block; - sector_t lowest_block = -1; - sector_t highest_block = 0; - int nr_extents = 0; - int ret = 0; - - if (PAGE_SIZE == F2FS_BLKSIZE) - return check_swap_activate_fast(sis, swap_file, span); - - ret = f2fs_is_file_aligned(inode); - if (ret) - goto out; - - blocks_per_page = bytes_to_blks(inode, PAGE_SIZE); - - /* - * Map all the blocks into the extent list. This code doesn't try - * to be very smart. 
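Unlike the legacy probe loop being removed below, the new check_swap_activate() no longer rejects a misaligned extent outright: it rounds the mapping up to a whole section, migrates those blocks with f2fs_migrate_blocks(), and retries the lookup. A small standalone model of the roundup arithmetic, with hypothetical sizes:

#include <stdio.h>

/* same semantics as the kernel's roundup() macro */
#define roundup(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned int blks_per_sec = 512;		/* hypothetical section size */
	unsigned int cur_lblock = 0, max = 2048;	/* swapfile bounds */
	unsigned int nr_pblocks = 700;			/* misaligned extent length */

	nr_pblocks = roundup(nr_pblocks, blks_per_sec);	/* -> 1024 */
	if (cur_lblock + nr_pblocks > max)
		nr_pblocks -= blks_per_sec;	/* stay inside the swapfile */
	if (nr_pblocks)
		printf("migrate %u blocks, then retry the mapping\n", nr_pblocks);
	return 0;
}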
- */ - probe_block = 0; - page_no = 0; - last_block = bytes_to_blks(inode, i_size_read(inode)); - while ((probe_block + blocks_per_page) <= last_block && - page_no < sis->max) { - unsigned block_in_page; - sector_t first_block; - sector_t block = 0; - - cond_resched(); - - block = probe_block; - ret = bmap(inode, &block); - if (ret) - goto out; - if (!block) - goto bad_bmap; - first_block = block; - - /* - * It must be PAGE_SIZE aligned on-disk - */ - if (first_block & (blocks_per_page - 1)) { - probe_block++; - goto reprobe; - } - - for (block_in_page = 1; block_in_page < blocks_per_page; - block_in_page++) { - - block = probe_block + block_in_page; - ret = bmap(inode, &block); - if (ret) - goto out; - if (!block) - goto bad_bmap; - - if (block != first_block + block_in_page) { - /* Discontiguity */ - probe_block++; - goto reprobe; - } - } - - first_block >>= (PAGE_SHIFT - inode->i_blkbits); - if (page_no) { /* exclude the header page */ - if (first_block < lowest_block) - lowest_block = first_block; - if (first_block > highest_block) - highest_block = first_block; - } - - /* - * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks - */ - ret = add_swap_extent(sis, page_no, 1, first_block); - if (ret < 0) - goto out; - nr_extents += ret; - page_no++; - probe_block += blocks_per_page; -reprobe: - continue; - } - ret = nr_extents; - *span = 1 + highest_block - lowest_block; - if (page_no == 0) - page_no = 1; /* force Empty message */ - sis->max = page_no; - sis->pages = page_no - 1; - sis->highest_bit = page_no - 1; out: + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%u * N)", + not_aligned, blks_per_sec * F2FS_BLKSIZE); return ret; -bad_bmap: - f2fs_err(sbi, "Swapfile has holes\n"); - return -EINVAL; } static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -4067,6 +4019,12 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (f2fs_readonly(F2FS_I_SB(inode)->sb)) return -EROFS; + if (f2fs_lfs_mode(F2FS_I_SB(inode))) { + f2fs_err(F2FS_I_SB(inode), + "Swapfile not supported in LFS mode"); + return -EINVAL; + } + ret = f2fs_convert_inline_inode(inode); if (ret) return ret; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index c03949a7ccff..833325038ef3 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -152,6 +152,12 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->node_pages = NODE_MAPPING(sbi)->nrpages; if (sbi->meta_inode) si->meta_pages = META_MAPPING(sbi)->nrpages; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (sbi->compress_inode) { + si->compress_pages = COMPRESS_MAPPING(sbi)->nrpages; + si->compress_page_hit = atomic_read(&sbi->compress_page_hit); + } +#endif si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT]; si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT]; si->sits = MAIN_SEGS(sbi); @@ -309,6 +315,12 @@ get_cache: si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (sbi->compress_inode) { + unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } +#endif } static int stat_show(struct seq_file *s, void *v) @@ -476,6 +488,7 @@ static int stat_show(struct seq_file *s, void *v) "volatile IO: %4d (Max. 
%4d)\n", si->inmem_pages, si->aw_cnt, si->max_aw_cnt, si->vw_cnt, si->max_vw_cnt); + seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index dc7ce79672b8..456651682daf 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -16,6 +16,10 @@ #include "xattr.h" #include <trace/events/f2fs.h> +#ifdef CONFIG_UNICODE +extern struct kmem_cache *f2fs_cf_name_slab; +#endif + static unsigned long dir_blocks(struct inode *inode) { return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) @@ -77,11 +81,10 @@ int f2fs_init_casefolded_name(const struct inode *dir, { #ifdef CONFIG_UNICODE struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); if (IS_CASEFOLDED(dir)) { - fname->cf_name.name = f2fs_kmalloc(sbi, F2FS_NAME_LEN, - GFP_NOFS); + fname->cf_name.name = kmem_cache_alloc(f2fs_cf_name_slab, + GFP_NOFS); if (!fname->cf_name.name) return -ENOMEM; fname->cf_name.len = utf8_casefold(sb->s_encoding, @@ -89,7 +92,7 @@ int f2fs_init_casefolded_name(const struct inode *dir, fname->cf_name.name, F2FS_NAME_LEN); if ((int)fname->cf_name.len <= 0) { - kfree(fname->cf_name.name); + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; if (sb_has_strict_encoding(sb)) return -EINVAL; @@ -172,8 +175,10 @@ void f2fs_free_filename(struct f2fs_filename *fname) fname->crypto_buf.name = NULL; #endif #ifdef CONFIG_UNICODE - kfree(fname->cf_name.name); - fname->cf_name.name = NULL; + if (fname->cf_name.name) { + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); + fname->cf_name.name = NULL; + } #endif } @@ -929,11 +934,15 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, !f2fs_truncate_hole(dir, page->index, page->index + 1)) { f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); - f2fs_clear_page_private(page); ClearPageUptodate(page); - clear_cold_data(page); + + clear_page_private_gcing(page); + inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); + + detach_page_private(page); + set_page_private(page, 0); } f2fs_put_page(page, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c83d90125ebd..ee8eb33e2c25 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -98,6 +98,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_ATGC 0x08000000 #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_MOUNT_GC_MERGE 0x20000000 +#define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -150,8 +151,10 @@ struct f2fs_mount_info { unsigned char compress_level; /* compress level */ bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ + unsigned char nocompress_ext_cnt; /* nocompress extension count */ int compress_mode; /* compression mode */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ + unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; #define F2FS_FEATURE_ENCRYPT 0x0001 @@ -168,6 +171,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_SB_CHKSUM 0x0800 #define F2FS_FEATURE_CASEFOLD 0x1000 #define F2FS_FEATURE_COMPRESSION 0x2000 +#define F2FS_FEATURE_RO 0x4000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -706,6 
+710,8 @@ enum { FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ FI_MMAP_FILE, /* indicate file was mmapped */ FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */ + FI_COMPRESS_RELEASED, /* compressed blocks were released */ + FI_ALIGNED_WRITE, /* enable aligned write */ FI_MAX, /* max flag, never be used */ }; @@ -939,6 +945,7 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, #define NR_CURSEG_DATA_TYPE (3) #define NR_CURSEG_NODE_TYPE (3) #define NR_CURSEG_INMEM_TYPE (2) +#define NR_CURSEG_RO_TYPE (2) #define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) #define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) @@ -1291,17 +1298,119 @@ enum { */ }; +static inline int f2fs_test_bit(unsigned int nr, char *addr); +static inline void f2fs_set_bit(unsigned int nr, char *addr); +static inline void f2fs_clear_bit(unsigned int nr, char *addr); + /* - * this value is set in page as a private data which indicate that - * the page is atomically written, and it is in inmem_pages list. + * Layout of f2fs page.private: + * + * Layout A: lowest bit should be 1 + * | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... | + * bit 0 PAGE_PRIVATE_NOT_POINTER + * bit 1 PAGE_PRIVATE_ATOMIC_WRITE + * bit 2 PAGE_PRIVATE_DUMMY_WRITE + * bit 3 PAGE_PRIVATE_ONGOING_MIGRATION + * bit 4 PAGE_PRIVATE_INLINE_INODE + * bit 5 PAGE_PRIVATE_REF_RESOURCE + * bit 6- f2fs private data + * + * Layout B: lowest bit should be 0 + * page.private is a wrapped pointer. */ -#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) -#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) +enum { + PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */ + PAGE_PRIVATE_ATOMIC_WRITE, /* data page from atomic write path */ + PAGE_PRIVATE_DUMMY_WRITE, /* data page for padding aligned IO */ + PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */ + PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */ + PAGE_PRIVATE_REF_RESOURCE, /* dirty page has referenced resources */ + PAGE_PRIVATE_MAX +}; -#define IS_ATOMIC_WRITTEN_PAGE(page) \ - (page_private(page) == ATOMIC_WRITTEN_PAGE) -#define IS_DUMMY_WRITTEN_PAGE(page) \ - (page_private(page) == DUMMY_WRITTEN_PAGE) +#define PAGE_PRIVATE_GET_FUNC(name, flagname) \ +static inline bool page_private_##name(struct page *page) \ +{ \ + return PagePrivate(page) && \ + test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ + test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_SET_FUNC(name, flagname) \ +static inline void set_page_private_##name(struct page *page) \ +{ \ + if (!PagePrivate(page)) { \ + get_page(page); \ + SetPagePrivate(page); \ + set_page_private(page, 0); \ + } \ + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \ + set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ +static inline void clear_page_private_##name(struct page *page) \ +{ \ + clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ + if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { \ + set_page_private(page, 0); \ + if (PagePrivate(page)) { \ + ClearPagePrivate(page); \ + put_page(page); \ + }\ + } \ +} + +PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); +PAGE_PRIVATE_GET_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_GET_FUNC(atomic, ATOMIC_WRITE); 
+PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE); + +PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_SET_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE); + +PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE); + +static inline unsigned long get_page_private_data(struct page *page) +{ + unsigned long data = page_private(page); + + if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) + return 0; + return data >> PAGE_PRIVATE_MAX; +} + +static inline void set_page_private_data(struct page *page, unsigned long data) +{ + if (!PagePrivate(page)) { + get_page(page); + SetPagePrivate(page); + set_page_private(page, 0); + } + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); + page_private(page) |= data << PAGE_PRIVATE_MAX; +} + +static inline void clear_page_private_data(struct page *page) +{ + page_private(page) &= (1 << PAGE_PRIVATE_MAX) - 1; + if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { + set_page_private(page, 0); + if (PagePrivate(page)) { + ClearPagePrivate(page); + put_page(page); + } + } +} /* For compression */ enum compress_algorithm_type { @@ -1317,6 +1426,9 @@ enum compress_flag { COMPRESS_MAX_FLAG, }; +#define COMPRESS_WATERMARK 20 +#define COMPRESS_PERCENT 20 + #define COMPRESS_DATA_RESERVED_SIZE 4 struct compress_data { __le32 clen; /* compressed data size */ @@ -1594,6 +1706,9 @@ struct f2fs_sb_info { struct kobject s_stat_kobj; /* /sys/fs/f2fs/<devname>/stat */ struct completion s_stat_kobj_unregister; + struct kobject s_feature_list_kobj; /* /sys/fs/f2fs/<devname>/feature_list */ + struct completion s_feature_list_kobj_unregister; + /* For shrinker support */ struct list_head s_list; int s_ndevs; /* number of devices */ @@ -1626,6 +1741,12 @@ struct f2fs_sb_info { u64 compr_written_block; u64 compr_saved_block; u32 compr_new_inode; + + /* For compressed block cache */ + struct inode *compress_inode; /* cache compressed blocks */ + unsigned int compress_percent; /* cache page percentage */ + unsigned int compress_watermark; /* cache page watermark */ + atomic_t compress_page_hit; /* cache hit count */ #endif }; @@ -2678,6 +2799,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_DATA_EXIST: case FI_INLINE_DOTS: case FI_PIN_FILE: + case FI_COMPRESS_RELEASED: f2fs_mark_inode_dirty_sync(inode, true); } } @@ -2799,6 +2921,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_EXTRA_ATTR, fi->flags); if (ri->i_inline & F2FS_PIN_FILE) set_bit(FI_PIN_FILE, fi->flags); + if (ri->i_inline & F2FS_COMPRESS_RELEASED) + set_bit(FI_COMPRESS_RELEASED, fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -2819,6 +2943,8 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_EXTRA_ATTR; if (is_inode_flag_set(inode, FI_PIN_FILE)) ri->i_inline |= F2FS_PIN_FILE; + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + ri->i_inline |= F2FS_COMPRESS_RELEASED; } static inline int f2fs_has_extra_attr(struct inode *inode) @@ -3027,25 +3153,6 @@ static inline bool is_dot_dotdot(const u8 *name, size_t len) return false; } -static inline bool f2fs_may_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); - - if (!test_opt(sbi, EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - is_inode_flag_set(inode, FI_COMPRESSED_FILE)) - return false; - - /* - * for recovered files during mount do not create extents - * if shrinker is not registered. - */ - if (list_empty(&sbi->s_list)) - return false; - - return S_ISREG(inode->i_mode); -} - static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { @@ -3169,20 +3276,6 @@ static inline bool __is_valid_data_blkaddr(block_t blkaddr) return true; } -static inline void f2fs_set_page_private(struct page *page, - unsigned long data) -{ - if (PagePrivate(page)) - return; - - attach_page_private(page, (void *)data); -} - -static inline void f2fs_clear_page_private(struct page *page) -{ - detach_page_private(page); -} - /* * file.c */ @@ -3566,6 +3659,8 @@ void f2fs_destroy_garbage_collection_cache(void); */ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi); +int __init f2fs_create_recovery_cache(void); +void f2fs_destroy_recovery_cache(void); /* * debug.c @@ -3604,7 +3699,8 @@ struct f2fs_stat_info { unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; - int dirty_count, node_pages, meta_pages; + int dirty_count, node_pages, meta_pages, compress_pages; + int compress_page_hit; int prefree_count, call_count, cp_count, bg_cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int bg_node_segs, bg_data_segs; @@ -3940,7 +4036,9 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void f2fs_end_read_compressed_page(struct page *page, bool failed); +void f2fs_decompress_cluster(struct decompress_io_ctx *dic); +void f2fs_end_read_compressed_page(struct page *page, bool failed, + block_t blkaddr); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); @@ -3958,10 +4056,19 @@ void f2fs_put_page_dic(struct page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); +int f2fs_init_compress_inode(struct f2fs_sb_info *sbi); +void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi); int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); int __init f2fs_init_compress_cache(void); void f2fs_destroy_compress_cache(void); +struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi); +void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + nid_t ino, block_t blkaddr); +bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + block_t blkaddr); +void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); #define inc_compr_inode_stat(inode) \ do { \ struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ @@ -3990,7 +4097,9 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } -static inline void 
f2fs_end_read_compressed_page(struct page *page, bool failed) +static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { } +static inline void f2fs_end_read_compressed_page(struct page *page, + bool failed, block_t blkaddr) { WARN_ON_ONCE(1); } @@ -3998,10 +4107,20 @@ static inline void f2fs_put_page_dic(struct page *page) { WARN_ON_ONCE(1); } +static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } static inline void f2fs_destroy_compress_cache(void) { } +static inline void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, + block_t blkaddr) { } +static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, + struct page *page, nid_t ino, block_t blkaddr) { } +static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, + struct page *page, block_t blkaddr) { return false; } +static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, + nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) #endif @@ -4066,6 +4185,27 @@ F2FS_FEATURE_FUNCS(verity, VERITY); F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); +F2FS_FEATURE_FUNCS(readonly, RO); + +static inline bool f2fs_may_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, EXTENT_CACHE) || + is_inode_flag_set(inode, FI_NO_EXTENT) || + (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi))) + return false; + + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. 
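The relocated f2fs_may_extent_tree() above gains one exception: a compressed inode on a readonly image may keep an extent tree, since nothing on a readonly image changes underneath it. A compact model of the predicate's short-circuit order (the demo_* flags are hypothetical stand-ins for the mount option, inode flags, and shrinker state):

#include <stdbool.h>

struct demo_sb { bool extent_cache; bool ro_feature; bool shrinker_registered; };
struct demo_inode { bool no_extent; bool compressed; bool is_reg; };

static bool demo_may_extent_tree(const struct demo_sb *sb,
				 const struct demo_inode *inode)
{
	if (!sb->extent_cache || inode->no_extent ||
	    (inode->compressed && !sb->ro_feature))
		return false;
	if (!sb->shrinker_registered)	/* recovered files during mount */
		return false;
	return inode->is_reg;
}

int main(void)
{
	struct demo_sb sb = { .extent_cache = true, .ro_feature = true,
			      .shrinker_registered = true };
	struct demo_inode ino = { .compressed = true, .is_reg = true };
	return demo_may_extent_tree(&sb, &ino) ? 0 : 1;	/* allowed on RO image */
}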
+ */ + if (list_empty(&sbi->s_list)) + return false; + + return S_ISREG(inode->i_mode); +} #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ceb575f99048..6afd4562335f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -63,6 +63,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (unlikely(IS_IMMUTABLE(inode))) return VM_FAULT_SIGBUS; + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + return VM_FAULT_SIGBUS; + if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto err; @@ -85,10 +88,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) err = ret; goto err; } else if (ret) { - if (ret < F2FS_I(inode)->i_cluster_size) { - err = -EAGAIN; - goto err; - } need_alloc = false; } } @@ -117,7 +116,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_block(&dn, page->index); - f2fs_put_dnode(&dn); f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); } @@ -3203,7 +3201,7 @@ int f2fs_precache_extents(struct inode *inode) map.m_lblk = m_next_extent; } - return err; + return 0; } static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg) @@ -3237,7 +3235,7 @@ static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) { f2fs_warn(F2FS_I_SB(inode), - "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem.\n", + "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem", inode->i_ino); return -EOPNOTSUPP; } @@ -3425,7 +3423,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) goto out; } - if (IS_IMMUTABLE(inode)) { + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { ret = -EINVAL; goto out; } @@ -3434,8 +3432,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) if (ret) goto out; - F2FS_I(inode)->i_flags |= F2FS_IMMUTABLE_FL; - f2fs_set_inode_flags(inode); + set_inode_flag(inode, FI_COMPRESS_RELEASED); inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); @@ -3590,7 +3587,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) inode_lock(inode); - if (!IS_IMMUTABLE(inode)) { + if (!is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { ret = -EINVAL; goto unlock_inode; } @@ -3635,8 +3632,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) up_write(&F2FS_I(inode)->i_mmap_sem); if (ret >= 0) { - F2FS_I(inode)->i_flags &= ~F2FS_IMMUTABLE_FL; - f2fs_set_inode_flags(inode); + clear_inode_flag(inode, FI_COMPRESS_RELEASED); inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); } @@ -4023,9 +4019,8 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) LLONG_MAX); if (ret) - f2fs_warn(sbi, "%s: The file might be partially decompressed " - "(errno=%d). Please delete the file.\n", - __func__, ret); + f2fs_warn(sbi, "%s: The file might be partially decompressed (errno=%d). Please delete the file.", + __func__, ret); out: inode_unlock(inode); file_end_write(filp); @@ -4097,9 +4092,8 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) clear_inode_flag(inode, FI_ENABLE_COMPRESS); if (ret) - f2fs_warn(sbi, "%s: The file might be partially compressed " - "(errno=%d). 
Please delete the file.\n", - __func__, ret); + f2fs_warn(sbi, "%s: The file might be partially compressed (errno=%d). Please delete the file.", + __func__, ret); out: inode_unlock(inode); file_end_write(filp); @@ -4254,6 +4248,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto unlock; } + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EPERM; + goto unlock; + } + ret = generic_write_checks(iocb, from); if (ret > 0) { bool preallocated = false; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8d1f17ab94d8..0e42ee5f7770 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1031,8 +1031,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (unlikely(check_valid_map(sbi, segno, offset))) { if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) { - f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u\n", - blkaddr, source_blkaddr, segno); + f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u", + blkaddr, source_blkaddr, segno); f2fs_bug_on(sbi, 1); } } @@ -1261,6 +1261,7 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_put_page(mpage, 1); invalidate_mapping_pages(META_MAPPING(fio.sbi), fio.old_blkaddr, fio.old_blkaddr); + f2fs_invalidate_compress_page(fio.sbi, fio.old_blkaddr); set_page_dirty(fio.encrypted_page); if (clear_page_dirty_for_io(fio.encrypted_page)) @@ -1336,7 +1337,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } set_page_dirty(page); - set_cold_data(page); + set_page_private_gcing(page); } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -1362,11 +1363,11 @@ retry: f2fs_remove_dirty_inode(inode); } - set_cold_data(page); + set_page_private_gcing(page); err = f2fs_do_write_data_page(&fio); if (err) { - clear_cold_data(page); + clear_page_private_gcing(page); if (err == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); @@ -1450,10 +1451,8 @@ next_step: if (phase == 3) { inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode) || is_bad_inode(inode)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); + if (IS_ERR(inode) || is_bad_inode(inode)) continue; - } if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { @@ -1822,6 +1821,7 @@ static void init_atgc_management(struct f2fs_sb_info *sbi) am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO; am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT; am->age_weight = DEF_GC_THREAD_AGE_WEIGHT; + am->age_threshold = DEF_GC_THREAD_AGE_THRESHOLD; } void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 92652ca7a7c8..56a20d5c15da 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -173,7 +173,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* clear inline data and flag after data writeback */ f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0); - clear_inline_node(dn->inode_page); + clear_page_private_inline(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); clear_inode_flag(dn->inode, FI_INLINE_DATA); @@ -255,7 +255,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); - clear_inline_node(dn.inode_page); + clear_page_private_inline(dn.inode_page); f2fs_put_dnode(&dn); return 0; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index b401f08569f7..9141147b5bb0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -18,6 +18,10 @@ #include <trace/events/f2fs.h> 
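The file.c hunks above replace the old immutable-bit trick with a dedicated FI_COMPRESS_RELEASED flag: while compressed blocks are released, page faults get SIGBUS, buffered writes get EPERM, and the reserve ioctl clears the flag again. A reduced model of that gate (demo_* names are hypothetical):

#include <errno.h>
#include <stdbool.h>

struct demo_inode {
	bool immutable;			/* FS_IMMUTABLE_FL */
	bool compress_released;		/* FI_COMPRESS_RELEASED */
};

/* returns 0 when a buffered write may proceed */
static int demo_check_write(const struct demo_inode *inode)
{
	if (inode->immutable)
		return -EPERM;
	if (inode->compress_released)
		return -EPERM;	/* blocks were released; reserve them first */
	return 0;
}

int main(void)
{
	struct demo_inode ino = { .compress_released = true };
	return demo_check_write(&ino) == -EPERM ? 0 : 1;
}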
+#ifdef CONFIG_F2FS_FS_COMPRESSION +extern const struct address_space_operations f2fs_compress_aops; +#endif + void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { if (is_inode_flag_set(inode, FI_NEW_INODE)) @@ -494,6 +498,11 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) goto make_now; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (ino == F2FS_COMPRESS_INO(sbi)) + goto make_now; +#endif + ret = do_read_inode(inode); if (ret) goto bad_inode; @@ -504,6 +513,12 @@ make_now: } else if (ino == F2FS_META_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_meta_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + } else if (ino == F2FS_COMPRESS_INO(sbi)) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + inode->i_mapping->a_ops = &f2fs_compress_aops; +#endif + mapping_set_gfp_mask(inode->i_mapping, + GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); } else if (S_ISREG(inode->i_mode)) { inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; @@ -646,7 +661,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) /* deleted inode */ if (inode->i_nlink == 0) - clear_inline_node(node_page); + clear_page_private_inline(node_page); F2FS_I(inode)->i_disk_time[0] = inode->i_atime; F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; @@ -723,8 +738,12 @@ void f2fs_evict_inode(struct inode *inode) trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); + if (test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(sbi, inode->i_ino); + if (inode->i_ino == F2FS_NODE_INO(sbi) || - inode->i_ino == F2FS_META_INO(sbi)) + inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_COMPRESS_INO(sbi)) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a9cd9cf97229..e149c8c66a71 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -153,7 +153,8 @@ fail_drop: return ERR_PTR(err); } -static inline int is_extension_exist(const unsigned char *s, const char *sub) +static inline int is_extension_exist(const unsigned char *s, const char *sub, + bool tmp_ext) { size_t slen = strlen(s); size_t sublen = strlen(sub); @@ -169,6 +170,13 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub) if (slen < sublen + 2) return 0; + if (!tmp_ext) { + /* file has no temp extension */ + if (s[slen - sublen - 1] != '.') + return 0; + return !strncasecmp(s + slen - sublen, sub, sublen); + } + for (i = 1; i < slen - sublen; i++) { if (s[i] != '.') continue; @@ -194,7 +202,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * hot_count = sbi->raw_super->hot_ext_count; for (i = 0; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i])) + if (is_extension_exist(name, extlist[i], true)) break; } @@ -279,14 +287,16 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - unsigned char (*ext)[F2FS_EXTENSION_LEN]; - unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions; + unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; + unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; int i, cold_count, hot_count; if (!f2fs_sb_has_compression(sbi) || - 
is_inode_flag_set(inode, FI_COMPRESSED_FILE) || F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || - !f2fs_may_compress(inode)) + !f2fs_may_compress(inode) || + (!ext_cnt && !noext_cnt)) return; down_read(&sbi->sb_lock); @@ -295,7 +305,7 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, hot_count = sbi->raw_super->hot_ext_count; for (i = cold_count; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i])) { + if (is_extension_exist(name, extlist[i], false)) { up_read(&sbi->sb_lock); return; } @@ -303,10 +313,18 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, up_read(&sbi->sb_lock); - ext = F2FS_OPTION(sbi).extensions; + for (i = 0; i < noext_cnt; i++) { + if (is_extension_exist(name, noext[i], false)) { + f2fs_disable_compressed_file(inode); + return; + } + } + + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return; for (i = 0; i < ext_cnt; i++) { - if (!is_extension_exist(name, ext[i])) + if (!is_extension_exist(name, ext[i], false)) continue; set_compress_context(inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e67ce5f13b98..0be9e2d7120e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -97,6 +97,20 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; res = mem_size < (avail_ram * nm_i->ram_thresh / 100); + } else if (type == COMPRESS_PAGE) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned long free_ram = val.freeram; + + /* + * If free memory is lower than the watermark, or the cached page + * count exceeds the threshold, deny caching compressed pages. + */ + res = (free_ram > avail_ram * sbi->compress_watermark / 100) && + (COMPRESS_MAPPING(sbi)->nrpages < + free_ram * sbi->compress_percent / 100); +#else + res = false; +#endif } else { if (!sbi->sb->s_bdi->wb.dirty_exceeded) return true; } @@ -1535,13 +1549,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, trace_f2fs_writepage(page, NODE); if (unlikely(f2fs_cp_error(sbi))) { - if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) { - ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); - unlock_page(page); - return 0; - } - goto redirty_out; + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); + return 0; } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; @@ -1860,8 +1871,8 @@ continue_unlock: } /* flush inline_data, if it's async context.
*/ - if (is_inline_node(page)) { - clear_inline_node(page); + if (page_private_inline(page)) { + clear_page_private_inline(page); unlock_page(page); flush_inline_data(sbi, ino_of_node(page)); continue; @@ -1941,8 +1952,8 @@ continue_unlock: goto write_node; /* flush inline_data */ - if (is_inline_node(page)) { - clear_inline_node(page); + if (page_private_inline(page)) { + clear_page_private_inline(page); unlock_page(page); flush_inline_data(sbi, ino_of_node(page)); goto lock_node; @@ -2096,7 +2107,7 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); - f2fs_set_page_private(page, 0); + set_page_private_reference(page); return 1; } return 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 7a45c0f10629..ff14a6e5ac1c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -38,6 +38,9 @@ /* return value for read_node_page */ #define LOCKED_PAGE 1 +/* check pinned file's alignment status of physical blocks */ +#define FILE_NOT_ALIGNED 1 + /* For flag in struct node_info */ enum { IS_CHECKPOINTED, /* is it checkpointed before? */ @@ -148,6 +151,7 @@ enum mem_type { EXTENT_CACHE, /* indicates extent cache */ INMEM_PAGES, /* indicates inmemory pages */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ + COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ }; @@ -389,20 +393,6 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold node blocks in their node footer * - Mark cold data pages in page cache */ -static inline int is_cold_data(struct page *page) -{ - return PageChecked(page); -} - -static inline void set_cold_data(struct page *page) -{ - SetPageChecked(page); -} - -static inline void clear_cold_data(struct page *page) -{ - ClearPageChecked(page); -} static inline int is_node(struct page *page, int type) { @@ -414,21 +404,6 @@ static inline int is_node(struct page *page, int type) #define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) #define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) -static inline int is_inline_node(struct page *page) -{ - return PageChecked(page); -} - -static inline void set_inline_node(struct page *page) -{ - SetPageChecked(page); -} - -static inline void clear_inline_node(struct page *page) -{ - ClearPageChecked(page); -} - static inline void set_cold_node(struct page *page, bool is_dir) { struct f2fs_node *rn = F2FS_NODE(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 422146c6d866..695eacfe776c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -45,6 +45,10 @@ static struct kmem_cache *fsync_entry_slab; +#ifdef CONFIG_UNICODE +extern struct kmem_cache *f2fs_cf_name_slab; +#endif + bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) { s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); @@ -145,7 +149,7 @@ static int init_recovered_filename(const struct inode *dir, f2fs_hash_filename(dir, fname); #ifdef CONFIG_UNICODE /* Case-sensitive match is fine for recovery */ - kfree(fname->cf_name.name); + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; #endif } else { @@ -788,13 +792,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif - fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", - sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) { - err = -ENOMEM; - 
goto out; - } - INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&tmp_inode_list); INIT_LIST_HEAD(&dir_list); @@ -867,8 +864,6 @@ skip: } } - kmem_cache_destroy(fsync_entry_slab); -out: #ifdef CONFIG_QUOTA /* Turn quotas off */ if (quota_enabled) @@ -878,3 +873,17 @@ out: return ret ? ret : err; } + +int __init f2fs_create_recovery_cache(void) +{ + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", + sizeof(struct fsync_inode_entry)); + if (!fsync_entry_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_recovery_cache(void) +{ + kmem_cache_destroy(fsync_entry_slab); +} diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 51dc79fad4fe..15cc89eef28d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -186,10 +186,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; - if (PagePrivate(page)) - set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); - else - f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); + set_page_private_atomic(page); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -272,9 +269,10 @@ next: /* we don't need to invalidate this in the successful status */ if (drop || recover) { ClearPageUptodate(page); - clear_cold_data(page); + clear_page_private_gcing(page); } - f2fs_clear_page_private(page); + detach_page_private(page); + set_page_private(page, 0); f2fs_put_page(page, 1); list_del(&cur->list); @@ -357,7 +355,7 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) struct list_head *head = &fi->inmem_pages; struct inmem_pages *cur = NULL; - f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + f2fs_bug_on(sbi, !page_private_atomic(page)); mutex_lock(&fi->inmem_lock); list_for_each_entry(cur, head, list) { @@ -373,9 +371,12 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) kmem_cache_free(inmem_entry_slab, cur); ClearPageUptodate(page); - f2fs_clear_page_private(page); + clear_page_private_atomic(page); f2fs_put_page(page, 0); + detach_page_private(page); + set_page_private(page, 0); + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); } @@ -2321,6 +2322,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) return; invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); + f2fs_invalidate_compress_page(sbi, addr); /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); @@ -3289,7 +3291,10 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; - if (is_cold_data(fio->page)) { + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return CURSEG_COLD_DATA_PINNED; + + if (page_private_gcing(fio->page)) { if (fio->sbi->am.atgc_enabled && (fio->io_type == FS_DATA_IO) && (fio->sbi->gc_mode != GC_URGENT_HIGH)) @@ -3468,9 +3473,11 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio); - if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(fio->sbi), fio->old_blkaddr, fio->old_blkaddr); + f2fs_invalidate_compress_page(fio->sbi, fio->old_blkaddr); + } /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); @@ -3660,6 +3667,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr,
old_blkaddr); + f2fs_invalidate_compress_page(sbi, old_blkaddr); if (!from_gc) update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); @@ -3919,7 +3927,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) /* sanity check for summary blocks */ if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) { - f2fs_err(sbi, "invalid journal entries nats %u sits %u\n", + f2fs_err(sbi, "invalid journal entries nats %u sits %u", nats_in_cursum(nat_j), sits_in_cursum(sit_j)); return -EINVAL; } @@ -4682,6 +4690,10 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) struct seg_entry *se = get_seg_entry(sbi, curseg->segno); unsigned int blkofs = curseg->next_blkoff; + if (f2fs_sb_has_readonly(sbi) && + i != CURSEG_HOT_DATA && i != CURSEG_HOT_NODE) + continue; + sanity_check_seg_type(sbi, curseg->seg_type); if (f2fs_test_bit(blkofs, se->cur_valid_map)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7d325bfaf65a..8fecd3050ccd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -148,8 +148,10 @@ enum { Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, + Opt_nocompress_extension, Opt_compress_chksum, Opt_compress_mode, + Opt_compress_cache, Opt_atgc, Opt_gc_merge, Opt_nogc_merge, @@ -222,8 +224,10 @@ static match_table_t f2fs_tokens = { {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, + {Opt_nocompress_extension, "nocompress_extension=%s"}, {Opt_compress_chksum, "compress_chksum"}, {Opt_compress_mode, "compress_mode=%s"}, + {Opt_compress_cache, "compress_cache"}, {Opt_atgc, "atgc"}, {Opt_gc_merge, "gc_merge"}, {Opt_nogc_merge, "nogc_merge"}, @@ -275,6 +279,24 @@ static int f2fs_sb_read_encoding(const struct f2fs_super_block *sb, return 0; } + +struct kmem_cache *f2fs_cf_name_slab; +static int __init f2fs_create_casefold_cache(void) +{ + f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name", + F2FS_NAME_LEN); + if (!f2fs_cf_name_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_casefold_cache(void) +{ + kmem_cache_destroy(f2fs_cf_name_slab); +} +#else +static int __init f2fs_create_casefold_cache(void) { return 0; } +static void f2fs_destroy_casefold_cache(void) { } #endif static inline void limit_reserve_root(struct f2fs_sb_info *sbi) @@ -473,6 +495,43 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, } #ifdef CONFIG_F2FS_FS_COMPRESSION +/* + * 1. The same extension name cannot appear in both the compress and non-compress extension + * lists at the same time. + * 2. If the compress extension specifies all files, the types specified by the non-compress + * extension will be treated as special cases and will not be compressed. + * 3. Don't allow the non-compress extension to specify all files.
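+ * + * For example, with "compress_extension=*,nocompress_extension=log", every + * file except *.log is compressed; "nocompress_extension=*" alone, or "log" + * listed under both options, fails the mount with -EINVAL.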
+ */ +static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi) +{ + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + unsigned char (*noext)[F2FS_EXTENSION_LEN]; + int ext_cnt, noext_cnt, index = 0, no_index = 0; + + ext = F2FS_OPTION(sbi).extensions; + ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + noext = F2FS_OPTION(sbi).noextensions; + noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + + if (!noext_cnt) + return 0; + + for (no_index = 0; no_index < noext_cnt; no_index++) { + if (!strcasecmp("*", noext[no_index])) { + f2fs_info(sbi, "Don't allow the nocompress extension to specify all files"); + return -EINVAL; + } + for (index = 0; index < ext_cnt; index++) { + if (!strcasecmp(ext[index], noext[no_index])) { + f2fs_info(sbi, "Don't allow the same extension %s to appear in both compress and nocompress extensions", + ext[index]); + return -EINVAL; + } + } + } + return 0; +} + #ifdef CONFIG_F2FS_FS_LZ4 static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) { @@ -546,7 +605,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) substring_t args[MAX_OPT_ARGS]; #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned char (*ext)[F2FS_EXTENSION_LEN]; - int ext_cnt; + unsigned char (*noext)[F2FS_EXTENSION_LEN]; + int ext_cnt, noext_cnt; #endif char *p, *name; int arg = 0; @@ -555,7 +615,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) int ret; if (!options) - return 0; + goto default_check; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -1049,6 +1109,30 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); break; + case Opt_nocompress_extension: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + noext = F2FS_OPTION(sbi).noextensions; + noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + noext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(sbi, + "invalid extension length/number"); + kfree(name); + return -EINVAL; + } + + strcpy(noext[noext_cnt], name); + F2FS_OPTION(sbi).nocompress_ext_cnt++; + kfree(name); + break; case Opt_compress_chksum: + F2FS_OPTION(sbi).compress_chksum = true; + break; @@ -1066,12 +1150,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_compress_cache: + set_opt(sbi, COMPRESS_CACHE); + break; #else case Opt_compress_algorithm: case Opt_compress_log_size: case Opt_compress_extension: + case Opt_nocompress_extension: case Opt_compress_chksum: case Opt_compress_mode: + case Opt_compress_cache: f2fs_info(sbi, "compression options not supported"); break; #endif @@ -1090,6 +1179,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } } +default_check: #ifdef CONFIG_QUOTA if (f2fs_check_quota_options(sbi)) return -EINVAL; @@ -1122,6 +1212,13 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } #endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_test_compress_extension(sbi)) { + f2fs_err(sbi, "invalid compress or nocompress extension"); + return -EINVAL; + } +#endif + if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", F2FS_IO_SIZE_KB(sbi)); @@ -1153,7 +1250,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } if
(test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) { - f2fs_err(sbi, "LFS not compatible with checkpoint=disable\n"); + f2fs_err(sbi, "LFS not compatible with checkpoint=disable"); return -EINVAL; } @@ -1162,6 +1259,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) */ if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { + f2fs_err(sbi, "Only readonly mount is allowed"); + return -EROFS; + } return 0; } @@ -1403,6 +1505,8 @@ static void f2fs_put_super(struct super_block *sb) f2fs_bug_on(sbi, sbi->fsync_node_num); + f2fs_destroy_compress_inode(sbi); + iput(sbi->node_inode); sbi->node_inode = NULL; @@ -1665,6 +1769,11 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, F2FS_OPTION(sbi).extensions[i]); } + for (i = 0; i < F2FS_OPTION(sbi).nocompress_ext_cnt; i++) { + seq_printf(seq, ",nocompress_extension=%s", + F2FS_OPTION(sbi).noextensions[i]); + } + if (F2FS_OPTION(sbi).compress_chksum) seq_puts(seq, ",compress_chksum"); @@ -1672,6 +1781,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, seq_printf(seq, ",compress_mode=%s", "fs"); else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER) seq_printf(seq, ",compress_mode=%s", "user"); + + if (test_opt(sbi, COMPRESS_CACHE)) + seq_puts(seq, ",compress_cache"); } #endif @@ -1819,7 +1931,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ - F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; + if (f2fs_sb_has_readonly(sbi)) + F2FS_OPTION(sbi).active_logs = NR_CURSEG_RO_TYPE; + else + F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; + F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; @@ -1949,6 +2065,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); + bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool checkpoint_changed; #ifdef CONFIG_QUOTA int i, j; @@ -2004,6 +2121,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) goto skip; + if (f2fs_sb_has_readonly(sbi) && !(*flags & SB_RDONLY)) { + err = -EROFS; + goto restore_opts; + } + #ifdef CONFIG_QUOTA if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) { err = dquot_suspend(sb, -1); @@ -2041,6 +2163,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (no_compress_cache == !!test_opt(sbi, COMPRESS_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switching the compress_cache option is not allowed"); + goto restore_opts; + } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); @@ -3137,14 +3265,15 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); - if (unlikely(fsmeta < F2FS_MIN_META_SEGMENTS || + if (!f2fs_sb_has_readonly(sbi) && + unlikely(fsmeta < F2FS_MIN_META_SEGMENTS || ovp_segments == 0 || reserved_segments == 0)) { f2fs_err(sbi, "Wrong layout: check mkfs.f2fs version");
return 1; } user_block_count = le64_to_cpu(ckpt->user_block_count); - segment_count_main = le32_to_cpu(raw_super->segment_count_main); + segment_count_main = le32_to_cpu(raw_super->segment_count_main) + + (f2fs_sb_has_readonly(sbi) ? 1 : 0); log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); if (!user_block_count || user_block_count >= segment_count_main << log_blocks_per_seg) { @@ -3175,6 +3304,10 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) return 1; + + if (f2fs_sb_has_readonly(sbi)) + goto check_data; + for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) { if (le32_to_cpu(ckpt->cur_node_segno[i]) == le32_to_cpu(ckpt->cur_node_segno[j])) { @@ -3185,10 +3318,15 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) } } } +check_data: for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) return 1; + + if (f2fs_sb_has_readonly(sbi)) + goto skip_cross; + for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) == le32_to_cpu(ckpt->cur_data_segno[j])) { @@ -3210,7 +3348,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) } } } - +skip_cross: sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); @@ -3555,7 +3693,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && !f2fs_sb_has_blkzoned(sbi)) { - f2fs_err(sbi, "Zoned block device feature not enabled\n"); + f2fs_err(sbi, "Zoned block device feature not enabled"); return -EINVAL; } if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { @@ -3940,10 +4078,14 @@ try_onemore: goto free_node_inode; } - err = f2fs_register_sysfs(sbi); + err = f2fs_init_compress_inode(sbi); if (err) goto free_root_inode; + err = f2fs_register_sysfs(sbi); + if (err) + goto free_compress_inode; + #ifdef CONFIG_QUOTA /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) { @@ -4084,6 +4226,8 @@ free_meta: /* evict some inodes being cached by GC */ evict_inodes(sb); f2fs_unregister_sysfs(sbi); +free_compress_inode: + f2fs_destroy_compress_inode(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -4162,6 +4306,15 @@ static void kill_f2fs_super(struct super_block *sb) f2fs_stop_gc_thread(sbi); f2fs_stop_discard_thread(sbi); +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * later evict_inode() can bypass checking and invalidating + * compress inode cache.
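+ * Drop all cached compressed pages up front here, while the cache + * inode is still valid.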
+ */ + if (test_opt(sbi, COMPRESS_CACHE)) + truncate_inode_pages_final(COMPRESS_MAPPING(sbi)); +#endif + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { struct cp_control cpc = { @@ -4227,9 +4380,12 @@ static int __init init_f2fs_fs(void) err = f2fs_create_checkpoint_caches(); if (err) goto free_segment_manager_caches; - err = f2fs_create_extent_cache(); + err = f2fs_create_recovery_cache(); if (err) goto free_checkpoint_caches; + err = f2fs_create_extent_cache(); + if (err) + goto free_recovery_cache; err = f2fs_create_garbage_collection_cache(); if (err) goto free_extent_cache; @@ -4258,7 +4414,12 @@ static int __init init_f2fs_fs(void) err = f2fs_init_compress_cache(); if (err) goto free_compress_mempool; + err = f2fs_create_casefold_cache(); + if (err) + goto free_compress_cache; return 0; +free_compress_cache: + f2fs_destroy_compress_cache(); free_compress_mempool: f2fs_destroy_compress_mempool(); free_bioset: @@ -4278,6 +4439,8 @@ free_garbage_collection_cache: f2fs_destroy_garbage_collection_cache(); free_extent_cache: f2fs_destroy_extent_cache(); +free_recovery_cache: + f2fs_destroy_recovery_cache(); free_checkpoint_caches: f2fs_destroy_checkpoint_caches(); free_segment_manager_caches: @@ -4292,6 +4455,7 @@ fail: static void __exit exit_f2fs_fs(void) { + f2fs_destroy_casefold_cache(); f2fs_destroy_compress_cache(); f2fs_destroy_compress_mempool(); f2fs_destroy_bioset(); @@ -4303,6 +4467,7 @@ static void __exit exit_f2fs_fs(void) f2fs_exit_sysfs(); f2fs_destroy_garbage_collection_cache(); f2fs_destroy_extent_cache(); + f2fs_destroy_recovery_cache(); f2fs_destroy_checkpoint_caches(); f2fs_destroy_segment_manager_caches(); f2fs_destroy_node_manager_caches(); @@ -4315,4 +4480,5 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32"); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 39b522ec73e7..6642246206bd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -37,6 +37,7 @@ enum { #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ CPRC_INFO, /* struct ckpt_req_control */ + ATGC_INFO, /* struct atgc_management */ }; struct f2fs_attr { @@ -75,6 +76,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) #endif else if (struct_type == CPRC_INFO) return (unsigned char *)&sbi->cprc_info; + else if (struct_type == ATGC_INFO) + return (unsigned char *)&sbi->am; return NULL; } @@ -155,6 +158,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_casefold(sbi)) len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "casefold"); + if (f2fs_sb_has_readonly(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "readonly"); if (f2fs_sb_has_compression(sbi)) len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? 
", " : "", "compression"); @@ -495,6 +501,20 @@ out: } #endif + if (!strcmp(a->attr.name, "atgc_candidate_ratio")) { + if (t > 100) + return -EINVAL; + sbi->am.candidate_ratio = t; + return count; + } + + if (!strcmp(a->attr.name, "atgc_age_weight")) { + if (t > 100) + return -EINVAL; + sbi->am.age_weight = t; + return count; + } + *ui = (unsigned int)t; return count; @@ -546,46 +566,49 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } -enum feat_id { - FEAT_CRYPTO = 0, - FEAT_BLKZONED, - FEAT_ATOMIC_WRITE, - FEAT_EXTRA_ATTR, - FEAT_PROJECT_QUOTA, - FEAT_INODE_CHECKSUM, - FEAT_FLEXIBLE_INLINE_XATTR, - FEAT_QUOTA_INO, - FEAT_INODE_CRTIME, - FEAT_LOST_FOUND, - FEAT_VERITY, - FEAT_SB_CHECKSUM, - FEAT_CASEFOLD, - FEAT_COMPRESSION, - FEAT_TEST_DUMMY_ENCRYPTION_V2, -}; - +/* + * Note that there are three feature list entries: + * 1) /sys/fs/f2fs/features + * : shows runtime features supported by in-kernel f2fs along with Kconfig. + * - ref. F2FS_FEATURE_RO_ATTR() + * + * 2) /sys/fs/f2fs/$s_id/features <deprecated> + * : shows on-disk features enabled by mkfs.f2fs, used for old kernels. This + * won't add new feature anymore, and thus, users should check entries in 3) + * instead of this 2). + * + * 3) /sys/fs/f2fs/$s_id/feature_list + * : shows on-disk features enabled by mkfs.f2fs per instance, which follows + * sysfs entry rule where each entry should expose single value. + * This list covers old feature list provided by 2) and beyond. Therefore, + * please add new on-disk feature in this list only. + * - ref. F2FS_SB_FEATURE_RO_ATTR() + */ static ssize_t f2fs_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - switch (a->id) { - case FEAT_CRYPTO: - case FEAT_BLKZONED: - case FEAT_ATOMIC_WRITE: - case FEAT_EXTRA_ATTR: - case FEAT_PROJECT_QUOTA: - case FEAT_INODE_CHECKSUM: - case FEAT_FLEXIBLE_INLINE_XATTR: - case FEAT_QUOTA_INO: - case FEAT_INODE_CRTIME: - case FEAT_LOST_FOUND: - case FEAT_VERITY: - case FEAT_SB_CHECKSUM: - case FEAT_CASEFOLD: - case FEAT_COMPRESSION: - case FEAT_TEST_DUMMY_ENCRYPTION_V2: + return sprintf(buf, "supported\n"); +} + +#define F2FS_FEATURE_RO_ATTR(_name) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ +} + +static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (F2FS_HAS_FEATURE(sbi, a->id)) return sprintf(buf, "supported\n"); - } - return 0; + return sprintf(buf, "unsupported\n"); +} + +#define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \ +static struct f2fs_attr f2fs_attr_sb_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_sb_feature_show, \ + .id = F2FS_FEATURE_##_feat, \ } #define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ @@ -605,13 +628,6 @@ static struct f2fs_attr f2fs_attr_##_name = { \ #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) -#define F2FS_FEATURE_RO_ATTR(_name, _id) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = 0444 }, \ - .show = f2fs_feature_show, \ - .id = _id, \ -} - #define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = 0444 }, \ @@ -685,31 +701,44 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #endif #ifdef CONFIG_FS_ENCRYPTION -F2FS_FEATURE_RO_ATTR(encryption, 
FEAT_CRYPTO); -F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2, FEAT_TEST_DUMMY_ENCRYPTION_V2); +F2FS_FEATURE_RO_ATTR(encryption); +F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2); +#ifdef CONFIG_UNICODE +F2FS_FEATURE_RO_ATTR(encrypted_casefold); #endif +#endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED -F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); +F2FS_FEATURE_RO_ATTR(block_zoned); #endif -F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); -F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); -F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); -F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); -F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); -F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); -F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); -F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); +F2FS_FEATURE_RO_ATTR(atomic_write); +F2FS_FEATURE_RO_ATTR(extra_attr); +F2FS_FEATURE_RO_ATTR(project_quota); +F2FS_FEATURE_RO_ATTR(inode_checksum); +F2FS_FEATURE_RO_ATTR(flexible_inline_xattr); +F2FS_FEATURE_RO_ATTR(quota_ino); +F2FS_FEATURE_RO_ATTR(inode_crtime); +F2FS_FEATURE_RO_ATTR(lost_found); #ifdef CONFIG_FS_VERITY -F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY); +F2FS_FEATURE_RO_ATTR(verity); #endif -F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); -F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); +F2FS_FEATURE_RO_ATTR(sb_checksum); +#ifdef CONFIG_UNICODE +F2FS_FEATURE_RO_ATTR(casefold); +#endif +F2FS_FEATURE_RO_ATTR(readonly); #ifdef CONFIG_F2FS_FS_COMPRESSION -F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); +F2FS_FEATURE_RO_ATTR(compression); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); #endif +F2FS_FEATURE_RO_ATTR(pin_file); + +/* For ATGC */ +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_ratio, candidate_ratio); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_count); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -778,6 +807,11 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(compr_saved_block), ATTR_LIST(compr_new_inode), #endif + /* For ATGC */ + ATTR_LIST(atgc_candidate_ratio), + ATTR_LIST(atgc_candidate_count), + ATTR_LIST(atgc_age_weight), + ATTR_LIST(atgc_age_threshold), NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -786,7 +820,10 @@ static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), ATTR_LIST(test_dummy_encryption_v2), +#ifdef CONFIG_UNICODE + ATTR_LIST(encrypted_casefold), #endif +#endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED ATTR_LIST(block_zoned), #endif @@ -802,10 +839,14 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(verity), #endif ATTR_LIST(sb_checksum), +#ifdef CONFIG_UNICODE ATTR_LIST(casefold), +#endif + ATTR_LIST(readonly), #ifdef CONFIG_F2FS_FS_COMPRESSION ATTR_LIST(compression), #endif + ATTR_LIST(pin_file), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); @@ -817,6 +858,40 @@ static struct attribute *f2fs_stat_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs_stat); +F2FS_SB_FEATURE_RO_ATTR(encryption, ENCRYPT); +F2FS_SB_FEATURE_RO_ATTR(block_zoned, BLKZONED); +F2FS_SB_FEATURE_RO_ATTR(extra_attr, EXTRA_ATTR); 
+F2FS_SB_FEATURE_RO_ATTR(project_quota, PRJQUOTA); +F2FS_SB_FEATURE_RO_ATTR(inode_checksum, INODE_CHKSUM); +F2FS_SB_FEATURE_RO_ATTR(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); +F2FS_SB_FEATURE_RO_ATTR(quota_ino, QUOTA_INO); +F2FS_SB_FEATURE_RO_ATTR(inode_crtime, INODE_CRTIME); +F2FS_SB_FEATURE_RO_ATTR(lost_found, LOST_FOUND); +F2FS_SB_FEATURE_RO_ATTR(verity, VERITY); +F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM); +F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD); +F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION); +F2FS_SB_FEATURE_RO_ATTR(readonly, RO); + +static struct attribute *f2fs_sb_feat_attrs[] = { + ATTR_LIST(sb_encryption), + ATTR_LIST(sb_block_zoned), + ATTR_LIST(sb_extra_attr), + ATTR_LIST(sb_project_quota), + ATTR_LIST(sb_inode_checksum), + ATTR_LIST(sb_flexible_inline_xattr), + ATTR_LIST(sb_quota_ino), + ATTR_LIST(sb_inode_crtime), + ATTR_LIST(sb_lost_found), + ATTR_LIST(sb_verity), + ATTR_LIST(sb_sb_checksum), + ATTR_LIST(sb_casefold), + ATTR_LIST(sb_compression), + ATTR_LIST(sb_readonly), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_sb_feat); + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -883,6 +958,33 @@ static struct kobj_type f2fs_stat_ktype = { .release = f2fs_stat_kobj_release, }; +static ssize_t f2fs_sb_feat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_feature_list_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static void f2fs_feature_list_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_feature_list_kobj); + complete(&sbi->s_feature_list_kobj_unregister); +} + +static const struct sysfs_ops f2fs_feature_list_attr_ops = { + .show = f2fs_sb_feat_attr_show, +}; + +static struct kobj_type f2fs_feature_list_ktype = { + .default_groups = f2fs_sb_feat_groups, + .sysfs_ops = &f2fs_feature_list_attr_ops, + .release = f2fs_feature_list_kobj_release, +}; + static int __maybe_unused segment_info_seq_show(struct seq_file *seq, void *offset) { @@ -1099,6 +1201,14 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) if (err) goto put_stat_kobj; + sbi->s_feature_list_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_feature_list_kobj_unregister); + err = kobject_init_and_add(&sbi->s_feature_list_kobj, + &f2fs_feature_list_ktype, + &sbi->s_kobj, "feature_list"); + if (err) + goto put_feature_list_kobj; + if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -1113,6 +1223,9 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) victim_bits_seq_show, sb); } return 0; +put_feature_list_kobj: + kobject_put(&sbi->s_feature_list_kobj); + wait_for_completion(&sbi->s_feature_list_kobj_unregister); put_stat_kobj: kobject_put(&sbi->s_stat_kobj); wait_for_completion(&sbi->s_stat_kobj_unregister); @@ -1135,6 +1248,9 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) kobject_del(&sbi->s_stat_kobj); kobject_put(&sbi->s_stat_kobj); wait_for_completion(&sbi->s_stat_kobj_unregister); + kobject_del(&sbi->s_feature_list_kobj); + kobject_put(&sbi->s_feature_list_kobj); + wait_for_completion(&sbi->s_feature_list_kobj_unregister); kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index fb733eb5aead..e55723744f58 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -213,7 +213,7 @@ static int fuse_setup_one_mapping(struct inode *inode, unsigned long 
start_idx, dmap->writable = writable; if (!upgrade) { /* - * We don't take a refernce on inode. inode is valid right now + * We don't take a reference on inode. inode is valid right now * and when inode is going away, cleanup logic should first * cleanup dmap entries. */ @@ -622,7 +622,7 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, } /* - * If read beyond end of file happnes, fs code seems to return + * If read beyond end of file happens, fs code seems to return * it as hole */ iomap_hole: @@ -1207,7 +1207,7 @@ static void fuse_dax_free_mem_worker(struct work_struct *work) ret); } - /* If number of free ranges are still below threhold, requeue */ + /* If number of free ranges are still below threshold, requeue */ kick_dmap_free_worker(fcd, 1); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a5ceccc5ef00..1c8f79b3dd06 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -91,7 +91,7 @@ static void fuse_drop_waiting(struct fuse_conn *fc) { /* * lockless check of fc->connected is okay, because atomic_dec_and_test() - * provides a memory barrier mached with the one in fuse_wait_aborted() + * provides a memory barrier matched with the one in fuse_wait_aborted() * to ensure no wake-up is missed. */ if (atomic_dec_and_test(&fc->num_waiting) && @@ -783,6 +783,7 @@ static int fuse_check_page(struct page *page) 1 << PG_uptodate | 1 << PG_lru | 1 << PG_active | + 1 << PG_workingset | 1 << PG_reclaim | 1 << PG_waiters))) { dump_page(page, "fuse: trying to steal weird page"); @@ -1271,6 +1272,15 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, goto restart; } spin_lock(&fpq->lock); + /* + * Must not put request on fpq->io queue after having been shut down by + * fuse_abort_conn() + */ + if (!fpq->connected) { + req->out.h.error = err = -ECONNABORTED; + goto out_end; + } list_add(&req->list, &fpq->io); spin_unlock(&fpq->lock); cs->req = req; @@ -1857,7 +1867,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, } err = -EINVAL; - if (oh.error <= -1000 || oh.error > 0) + if (oh.error <= -512 || oh.error > 0) goto copy_finish; spin_lock(&fpq->lock); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1b6c001a7dd1..eade6f965b2e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -252,7 +252,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (ret == -ENOMEM) goto out; if (ret || fuse_invalid_attr(&outarg.attr) || - inode_wrong_type(inode, outarg.attr.mode)) + fuse_stale_inode(inode, outarg.generation, &outarg.attr)) goto invalid; forget_all_cached_acls(inode); @@ -309,68 +309,23 @@ static int fuse_dentry_delete(const struct dentry *dentry) static struct vfsmount *fuse_dentry_automount(struct path *path) { struct fs_context *fsc; - struct fuse_mount *parent_fm = get_fuse_mount_super(path->mnt->mnt_sb); - struct fuse_conn *fc = parent_fm->fc; - struct fuse_mount *fm; struct vfsmount *mnt; struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry)); - struct super_block *sb; - int err; fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); - if (IS_ERR(fsc)) { - err = PTR_ERR(fsc); - goto out; - } - - err = -ENOMEM; - fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); - if (!fm) - goto out_put_fsc; - - fsc->s_fs_info = fm; - sb = sget_fc(fsc, NULL, set_anon_super_fc); - if (IS_ERR(sb)) { - err = PTR_ERR(sb); - kfree(fm); - goto out_put_fsc; - } - fm->fc = fuse_conn_get(fc); - - /* Initialize superblock, making @mp_fi its root */ - err = fuse_fill_super_submount(sb, mp_fi); - if (err) - goto
out_put_sb; + if (IS_ERR(fsc)) + return ERR_CAST(fsc); - sb->s_flags |= SB_ACTIVE; - fsc->root = dget(sb->s_root); - /* We are done configuring the superblock, so unlock it */ - up_write(&sb->s_umount); - - down_write(&fc->killsb); - list_add_tail(&fm->fc_entry, &fc->mounts); - up_write(&fc->killsb); + /* Pass the FUSE inode of the mount for fuse_get_tree_submount() */ + fsc->fs_private = mp_fi; /* Create the submount */ - mnt = vfs_create_mount(fsc); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); - goto out_put_fsc; - } - mntget(mnt); - put_fs_context(fsc); - return mnt; + mnt = fc_mount(fsc); + if (!IS_ERR(mnt)) + mntget(mnt); -out_put_sb: - /* - * Only jump here when fsc->root is NULL and sb is still locked - * (otherwise put_fs_context() will put the superblock) - */ - deactivate_locked_super(sb); -out_put_fsc: put_fs_context(fsc); -out: - return ERR_PTR(err); + return mnt; } const struct dentry_operations fuse_dentry_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4722fa31a185..97f860cfc195 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -645,7 +645,7 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. * * An example: - * User requested DIO read of 64K. It was splitted into two 32K fuse requests, + * User requested DIO read of 64K. It was split into two 32K fuse requests, * both submitted asynchronously. The first of them was ACKed by userspace as * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The * second request was ACKed as short, e.g. only 1K was read, resulting in @@ -1403,7 +1403,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, nbytes += ret; ret += start; - npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; + npages = DIV_ROUND_UP(ret, PAGE_SIZE); ap->descs[ap->num_pages].offset = start; fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); @@ -2905,11 +2905,13 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, }; int err; bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || - (mode & FALLOC_FL_PUNCH_HOLE); + (mode & (FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)); bool block_faults = FUSE_IS_DAX(inode) && lock_inode; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (fm->fc->no_fallocate) @@ -2924,7 +2926,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, goto out; } - if (mode & FALLOC_FL_PUNCH_HOLE) { + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { loff_t endbyte = offset + length - 1; err = fuse_writeback_range(inode, offset, endbyte); @@ -2964,7 +2966,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, file_update_time(file); } - if (mode & FALLOC_FL_PUNCH_HOLE) + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) truncate_pagecache_range(inode, offset, offset + length - 1); fuse_invalidate_attr(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7e463e220053..07829ce78695 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -761,6 +761,9 @@ struct fuse_conn { /* Auto-mount submounts announced by the server */ unsigned int auto_submounts:1; + /* Propagate syncfs() to server */ + unsigned int sync_fs:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -867,6 +870,13 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc) return 
atomic64_read(&fc->attr_version); } +static inline bool fuse_stale_inode(const struct inode *inode, int generation, + struct fuse_attr *attr) +{ + return inode->i_generation != generation || + inode_wrong_type(inode, attr->mode); +} + static inline void fuse_make_bad(struct inode *inode) { remove_inode_hash(inode); @@ -1082,15 +1092,6 @@ void fuse_send_init(struct fuse_mount *fm); int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); /* - * Fill in superblock for submounts - * @sb: partially-initialized superblock to fill in - * @parent_fi: The fuse_inode of the parent filesystem where this submount is - * mounted - */ -int fuse_fill_super_submount(struct super_block *sb, - struct fuse_inode *parent_fi); - -/* * Remove the mount from the connection * * Returns whether this was the last mount @@ -1098,6 +1099,11 @@ int fuse_fill_super_submount(struct super_block *sb, bool fuse_mount_remove(struct fuse_mount *fm); /* + * Setup context ops for submounts + */ +int fuse_init_fs_context_submount(struct fs_context *fsc); + +/* * Shut down the connection (possibly sending DESTROY request). */ void fuse_conn_destroy(struct fuse_mount *fm); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 393e36b74dc4..b9beb39a4a18 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -350,8 +350,8 @@ retry: inode->i_generation = generation; fuse_init_inode(inode, attr); unlock_new_inode(inode); - } else if (inode_wrong_type(inode, attr->mode)) { - /* Inode has changed type, any I/O on the old should fail */ + } else if (fuse_stale_inode(inode, generation, attr)) { + /* nodeid was reused, any I/O on the old inode should fail */ fuse_make_bad(inode); iput(inode); goto retry; @@ -506,6 +506,45 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) return err; } +static int fuse_sync_fs(struct super_block *sb, int wait) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; + struct fuse_syncfs_in inarg; + FUSE_ARGS(args); + int err; + + /* + * Userspace cannot handle the wait == 0 case. Avoid a + * gratuitous roundtrip. + */ + if (!wait) + return 0; + + /* The filesystem is being unmounted. Nothing to do. 
*/ + if (!sb->s_root) + return 0; + + if (!fc->sync_fs) + return 0; + + memset(&inarg, 0, sizeof(inarg)); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.opcode = FUSE_SYNCFS; + args.nodeid = get_node_id(sb->s_root->d_inode); + args.out_numargs = 0; + + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fc->sync_fs = 0; + err = 0; + } + + return err; +} + enum { OPT_SOURCE, OPT_SUBTYPE, @@ -909,6 +948,7 @@ static const struct super_operations fuse_super_operations = { .put_super = fuse_put_super, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, + .sync_fs = fuse_sync_fs, .show_options = fuse_show_options, }; @@ -1275,8 +1315,8 @@ static void fuse_sb_defaults(struct super_block *sb) sb->s_xattr = fuse_no_acl_xattr_handlers; } -int fuse_fill_super_submount(struct super_block *sb, - struct fuse_inode *parent_fi) +static int fuse_fill_super_submount(struct super_block *sb, + struct fuse_inode *parent_fi) { struct fuse_mount *fm = get_fuse_mount_super(sb); struct super_block *parent_sb = parent_fi->inode.i_sb; @@ -1313,6 +1353,58 @@ int fuse_fill_super_submount(struct super_block *sb, return 0; } +/* Filesystem context private data holds the FUSE inode of the mount point */ +static int fuse_get_tree_submount(struct fs_context *fsc) +{ + struct fuse_mount *fm; + struct fuse_inode *mp_fi = fsc->fs_private; + struct fuse_conn *fc = get_fuse_conn(&mp_fi->inode); + struct super_block *sb; + int err; + + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) + return -ENOMEM; + + fsc->s_fs_info = fm; + sb = sget_fc(fsc, NULL, set_anon_super_fc); + if (IS_ERR(sb)) { + kfree(fm); + return PTR_ERR(sb); + } + fm->fc = fuse_conn_get(fc); + + /* Initialize superblock, making @mp_fi its root */ + err = fuse_fill_super_submount(sb, mp_fi); + if (err) { + fuse_conn_put(fc); + kfree(fm); + sb->s_fs_info = NULL; + deactivate_locked_super(sb); + return err; + } + + down_write(&fc->killsb); + list_add_tail(&fm->fc_entry, &fc->mounts); + up_write(&fc->killsb); + + sb->s_flags |= SB_ACTIVE; + fsc->root = dget(sb->s_root); + + return 0; +} + +static const struct fs_context_operations fuse_context_submount_ops = { + .get_tree = fuse_get_tree_submount, +}; + +int fuse_init_fs_context_submount(struct fs_context *fsc) +{ + fsc->ops = &fuse_context_submount_ops; + return 0; +} +EXPORT_SYMBOL_GPL(fuse_init_fs_context_submount); + int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { struct fuse_dev *fud = NULL; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 277f7041d55a..bc267832310c 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -200,9 +200,12 @@ retry: if (!d_in_lookup(dentry)) { struct fuse_inode *fi; inode = d_inode(dentry); + if (inode && get_node_id(inode) != o->nodeid) + inode = NULL; if (!inode || - get_node_id(inode) != o->nodeid || - inode_wrong_type(inode, o->attr.mode)) { + fuse_stale_inode(inode, o->generation, &o->attr)) { + if (inode) + fuse_make_bad(inode); d_invalidate(dentry); dput(dentry); goto retry; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index bcb8a02e2d8b..8f52cdaa8445 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1447,6 +1447,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->release = fuse_free_conn; fc->delete_stale = true; fc->auto_submounts = true; + fc->sync_fs = true; /* Tell FUSE to split requests that exceed the virtqueue's size */ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, @@ -1496,6 +1497,9 
@@ static int virtio_fs_init_fs_context(struct fs_context *fsc) { struct fuse_fs_context *ctx; + if (fsc->purpose == FS_CONTEXT_FOR_SUBMOUNT) + return fuse_init_fs_context_submount(fsc); + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; diff --git a/fs/io_uring.c b/fs/io_uring.c index e55b21fc0ab2..d94fb5835a20 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -465,7 +465,8 @@ struct io_ring_ctx { struct mm_struct *mm_account; /* ctx exit and cancelation */ - struct callback_head *exit_task_work; + struct llist_head fallback_llist; + struct delayed_work fallback_work; struct work_struct exit_work; struct list_head tctx_list; struct completion ref_comp; @@ -784,9 +785,14 @@ struct async_poll { struct io_poll_iocb *double_poll; }; +typedef void (*io_req_tw_func_t)(struct io_kiocb *req); + struct io_task_work { - struct io_wq_work_node node; - task_work_func_t func; + union { + struct io_wq_work_node node; + struct llist_node fallback_node; + }; + io_req_tw_func_t func; }; enum { @@ -849,10 +855,7 @@ struct io_kiocb { /* used with ctx->iopoll_list with reads/writes */ struct list_head inflight_entry; - union { - struct io_task_work io_task_work; - struct callback_head task_work; - }; + struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; struct async_poll *apoll; @@ -1071,6 +1074,8 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx); static bool io_poll_remove_waitqs(struct io_kiocb *req); static int io_req_prep_async(struct io_kiocb *req); +static void io_fallback_req_func(struct work_struct *unused); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -1202,6 +1207,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->tctx_list); INIT_LIST_HEAD(&ctx->submit_state.comp.free_list); INIT_LIST_HEAD(&ctx->locked_free_list); + INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); return ctx; err: kfree(ctx->dummy_ubuf); @@ -1929,7 +1935,7 @@ static void tctx_task_work(struct callback_head *cb) ctx = req->ctx; percpu_ref_get(&ctx->refs); } - req->task_work.func(&req->task_work); + req->io_task_work.func(req); node = next; } if (wq_list_empty(&tctx->task_list)) { @@ -1946,17 +1952,13 @@ static void tctx_task_work(struct callback_head *cb) ctx_flush_and_put(ctx); } -static int io_req_task_work_add(struct io_kiocb *req) +static void io_req_task_work_add(struct io_kiocb *req) { struct task_struct *tsk = req->task; struct io_uring_task *tctx = tsk->io_uring; enum task_work_notify_mode notify; - struct io_wq_work_node *node, *prev; + struct io_wq_work_node *node; unsigned long flags; - int ret = 0; - - if (unlikely(tsk->flags & PF_EXITING)) - return -ESRCH; WARN_ON_ONCE(!tctx); @@ -1967,7 +1969,9 @@ static int io_req_task_work_add(struct io_kiocb *req) /* task_work already pending, we're done */ if (test_bit(0, &tctx->task_state) || test_and_set_bit(0, &tctx->task_state)) - return 0; + return; + if (unlikely(tsk->flags & PF_EXITING)) + goto fail; /* * SQPOLL kernel thread doesn't need notification, just a wakeup. For @@ -1976,72 +1980,28 @@ static int io_req_task_work_add(struct io_kiocb *req) * will do the job. */ notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; - if (!task_work_add(tsk, &tctx->task_work, notify)) { wake_up_process(tsk); - return 0; + return; } - - /* - * Slow path - we failed, find and delete work. 
if the work is not - * in the list, it got run and we're fine. - */ +fail: + clear_bit(0, &tctx->task_state); spin_lock_irqsave(&tctx->task_lock, flags); - wq_list_for_each(node, prev, &tctx->task_list) { - if (&req->io_task_work.node == node) { - wq_list_del(&tctx->task_list, node, prev); - ret = 1; - break; - } - } + node = tctx->task_list.first; + INIT_WQ_LIST(&tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, flags); - clear_bit(0, &tctx->task_state); - return ret; -} -static bool io_run_task_work_head(struct callback_head **work_head) -{ - struct callback_head *work, *next; - bool executed = false; - - do { - work = xchg(work_head, NULL); - if (!work) - break; - - do { - next = work->next; - work->func(work); - work = next; - cond_resched(); - } while (work); - executed = true; - } while (1); - - return executed; -} - -static void io_task_work_add_head(struct callback_head **work_head, - struct callback_head *task_work) -{ - struct callback_head *head; - - do { - head = READ_ONCE(*work_head); - task_work->next = head; - } while (cmpxchg(work_head, head, task_work) != head); -} - -static void io_req_task_work_add_fallback(struct io_kiocb *req, - task_work_func_t cb) -{ - init_task_work(&req->task_work, cb); - io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work); + while (node) { + req = container_of(node, struct io_kiocb, io_task_work.node); + node = node->next; + if (llist_add(&req->io_task_work.fallback_node, + &req->ctx->fallback_llist)) + schedule_delayed_work(&req->ctx->fallback_work, 1); + } } -static void io_req_task_cancel(struct callback_head *cb) +static void io_req_task_cancel(struct io_kiocb *req) { - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct io_ring_ctx *ctx = req->ctx; /* ctx is guaranteed to stay alive while we hold uring_lock */ @@ -2050,7 +2010,7 @@ static void io_req_task_cancel(struct callback_head *cb) mutex_unlock(&ctx->uring_lock); } -static void __io_req_task_submit(struct io_kiocb *req) +static void io_req_task_submit(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -2063,28 +2023,17 @@ static void __io_req_task_submit(struct io_kiocb *req) mutex_unlock(&ctx->uring_lock); } -static void io_req_task_submit(struct callback_head *cb) -{ - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - - __io_req_task_submit(req); -} - static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { req->result = ret; - req->task_work.func = io_req_task_cancel; - - if (unlikely(io_req_task_work_add(req))) - io_req_task_work_add_fallback(req, io_req_task_cancel); + req->io_task_work.func = io_req_task_cancel; + io_req_task_work_add(req); } static void io_req_task_queue(struct io_kiocb *req) { - req->task_work.func = io_req_task_submit; - - if (unlikely(io_req_task_work_add(req))) - io_req_task_queue_fail(req, -ECANCELED); + req->io_task_work.func = io_req_task_submit; + io_req_task_work_add(req); } static inline void io_queue_next(struct io_kiocb *req) @@ -2195,18 +2144,10 @@ static inline void io_put_req(struct io_kiocb *req) io_free_req(req); } -static void io_put_req_deferred_cb(struct callback_head *cb) -{ - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - - io_free_req(req); -} - static void io_free_req_deferred(struct io_kiocb *req) { - req->task_work.func = io_put_req_deferred_cb; - if (unlikely(io_req_task_work_add(req))) - io_req_task_work_add_fallback(req, io_put_req_deferred_cb); + req->io_task_work.func = io_free_req; + io_req_task_work_add(req); } static 
inline void io_put_req_deferred(struct io_kiocb *req, int refs) @@ -2415,11 +2356,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * very same mutex. */ if (list_empty(&ctx->iopoll_list)) { + u32 tail = ctx->cached_cq_tail; + mutex_unlock(&ctx->uring_lock); io_run_task_work(); mutex_lock(&ctx->uring_lock); - if (list_empty(&ctx->iopoll_list)) + /* some requests don't go through iopoll_list */ + if (tail != ctx->cached_cq_tail || + list_empty(&ctx->iopoll_list)) break; } ret = io_do_iopoll(ctx, &nr_events, min); @@ -2485,6 +2430,17 @@ static bool io_rw_should_reissue(struct io_kiocb *req) } #endif +static void io_fallback_req_func(struct work_struct *work) +{ + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, + fallback_work.work); + struct llist_node *node = llist_del_all(&ctx->fallback_llist); + struct io_kiocb *req, *tmp; + + llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) + req->io_task_work.func(req); +} + static void __io_complete_rw(struct io_kiocb *req, long res, long res2, unsigned int issue_flags) { @@ -4850,10 +4806,8 @@ struct io_poll_table { }; static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, - __poll_t mask, task_work_func_t func) + __poll_t mask, io_req_tw_func_t func) { - int ret; - /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) return 0; @@ -4863,7 +4817,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, list_del_init(&poll->wait.entry); req->result = mask; - req->task_work.func = func; + req->io_task_work.func = func; /* * If this fails, then the task is exiting. When a task exits, the @@ -4871,11 +4825,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, * of executing it. We can't safely execute it anyway, as we may not * have the needed state needed for it anyway. 
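 *
 * With this patch that failure case is handled inside
 * io_req_task_work_add() itself: requests from exiting tasks are
 * pushed onto a lock-free llist and drained later by
 * io_fallback_req_func().  For reference, the underlying llist idiom,
 * push with cmpxchg and detach-all with xchg, can be sketched in
 * plain userspace C; every name below is illustrative and not part
 * of the patch:
 *
 *	#include <stdatomic.h>
 *	#include <stdbool.h>
 *	#include <stddef.h>
 *
 *	struct node { struct node *next; };
 *	static _Atomic(struct node *) list_head;
 *
 *	// push one node; true means the list was empty, so the
 *	// consumer must be kicked (cf. schedule_delayed_work() above)
 *	static bool push(struct node *n)
 *	{
 *		struct node *old = atomic_load(&list_head);
 *
 *		do {
 *			n->next = old;
 *		} while (!atomic_compare_exchange_weak(&list_head, &old, n));
 *		return old == NULL;
 *	}
 *
 *	// detach the whole list in one atomic step, then walk it
 *	static void drain(void (*fn)(struct node *))
 *	{
 *		struct node *n = atomic_exchange(&list_head, NULL);
 *
 *		while (n) {
 *			struct node *next = n->next;
 *
 *			fn(n);
 *			n = next;
 *		}
 *	}
 *
 * Note that the detach hands nodes back newest-first, which is the
 * order io_fallback_req_func() runs them in.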
*/ - ret = io_req_task_work_add(req); - if (unlikely(ret)) { - WRITE_ONCE(poll->canceled, true); - io_req_task_work_add_fallback(req, func); - } + io_req_task_work_add(req); return 1; } @@ -4884,6 +4834,9 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) { struct io_ring_ctx *ctx = req->ctx; + if (unlikely(req->task->flags & PF_EXITING)) + WRITE_ONCE(poll->canceled, true); + if (!req->result && !READ_ONCE(poll->canceled)) { struct poll_table_struct pt = { ._key = poll->events }; @@ -4960,9 +4913,8 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) return !(flags & IORING_CQE_F_MORE); } -static void io_poll_task_func(struct callback_head *cb) +static void io_poll_task_func(struct io_kiocb *req) { - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt; @@ -4984,7 +4936,7 @@ static void io_poll_task_func(struct callback_head *cb) if (done) { nxt = io_put_req_find_next(req); if (nxt) - __io_req_task_submit(nxt); + io_req_task_submit(nxt); } } } @@ -5004,7 +4956,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, list_del_init(&wait->entry); - if (poll && poll->head) { + if (poll->head) { bool done; spin_lock(&poll->head->lock); @@ -5093,9 +5045,8 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } -static void io_async_task_func(struct callback_head *cb) +static void io_async_task_func(struct io_kiocb *req) { - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; @@ -5111,7 +5062,7 @@ static void io_async_task_func(struct callback_head *cb) spin_unlock_irq(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) - __io_req_task_submit(req); + io_req_task_submit(req); else io_req_complete_failed(req, -ECANCELED); } @@ -6072,7 +6023,7 @@ static bool io_drain_req(struct io_kiocb *req) io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL); if (!de) { - io_req_complete_failed(req, ret); + io_req_complete_failed(req, -ENOMEM); return true; } @@ -8767,11 +8718,6 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) return -EINVAL; } -static inline bool io_run_ctx_fallback(struct io_ring_ctx *ctx) -{ - return io_run_task_work_head(&ctx->exit_task_work); -} - struct io_tctx_exit { struct callback_head task_work; struct completion completion; @@ -8837,7 +8783,7 @@ static void io_ring_exit_work(struct work_struct *work) /* * Some may use context even when all refs and requests have been put, * and they are free to do so while still holding uring_lock or - * completion_lock, see __io_req_task_submit(). Apart from other work, + * completion_lock, see io_req_task_submit(). Apart from other work, * this lock/unlock section also waits them to finish. 
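 *
 * As a side note, the empty lock/unlock pair below is a classic mutex
 * barrier idiom: once this thread acquires the mutex, every earlier
 * critical section must have completed.  A minimal userspace sketch
 * of the idiom, names illustrative:
 *
 *	#include <pthread.h>
 *
 *	static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
 *
 *	static void wait_for_current_holders(void)
 *	{
 *		pthread_mutex_lock(&lk);   // returns only once any
 *		pthread_mutex_unlock(&lk); // current holder has unlocked
 *	}
 *
 * Nothing is accessed inside the section; acquiring the lock is the
 * whole point.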
*/ mutex_lock(&ctx->uring_lock); @@ -9036,7 +8982,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) ret |= io_run_task_work(); - ret |= io_run_ctx_fallback(ctx); if (!ret) break; cond_resched(); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 51d1eb2ffeb9..746132998c57 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -701,7 +701,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) __buffer_unlink(jh); jh->b_cp_transaction = NULL; - percpu_counter_dec(&journal->j_jh_shrink_count); + percpu_counter_dec(&journal->j_checkpoint_jh_count); jbd2_journal_put_journal_head(jh); /* Is this transaction empty? */ @@ -764,7 +764,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, jh->b_cpnext->b_cpprev = jh; } transaction->t_checkpoint_list = jh; - percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count); + percpu_counter_inc(&transaction->t_journal->j_checkpoint_jh_count); } /* diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 152880c298ca..35302bc192eb 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1283,6 +1283,48 @@ static int jbd2_min_tag_size(void) return sizeof(journal_block_tag_t) - 4; } +/** + * jbd2_journal_shrink_scan() + * + * Scan the checkpointed buffer on the checkpoint list and release the + * journal_head. + */ +static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = container_of(shrink, journal_t, j_shrinker); + unsigned long nr_to_scan = sc->nr_to_scan; + unsigned long nr_shrunk; + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); + trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); + + nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); + + count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); + trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); + + return nr_shrunk; +} + +/** + * jbd2_journal_shrink_count() + * + * Count the number of checkpoint buffers on the checkpoint list. 
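+ *
+ * As context for review: the count/scan pair registered below follows
+ * the standard shrinker contract, where count_objects() returns a
+ * cheap, possibly stale estimate and scan_objects() releases up to
+ * sc->nr_to_scan objects and returns how many it actually freed.
+ * A toy model of that contract, every name made up:
+ *
+ *	struct toy_cache { unsigned long nr; };
+ *
+ *	static unsigned long toy_count(struct toy_cache *c)
+ *	{
+ *		return c->nr;		// estimate only, may race
+ *	}
+ *
+ *	static unsigned long toy_scan(struct toy_cache *c,
+ *				      unsigned long nr_to_scan)
+ *	{
+ *		unsigned long freed = 0;
+ *
+ *		while (freed < nr_to_scan && c->nr) {
+ *			c->nr--;	// stands in for freeing one object
+ *			freed++;
+ *		}
+ *		return freed;
+ *	}
+ *
+ * The percpu counter plays toy_count's role here: a cheap population
+ * estimate the count hook can read without taking locks.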
+ */ +static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = container_of(shrink, journal_t, j_shrinker); + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); + trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); + + return count; +} + /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing @@ -1361,9 +1403,23 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; + journal->j_shrink_transaction = NULL; + journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; + journal->j_shrinker.count_objects = jbd2_journal_shrink_count; + journal->j_shrinker.seeks = DEFAULT_SEEKS; + journal->j_shrinker.batch = journal->j_max_transaction_buffers; + + if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL)) + goto err_cleanup; + + if (register_shrinker(&journal->j_shrinker)) { + percpu_counter_destroy(&journal->j_checkpoint_jh_count); + goto err_cleanup; + } return journal; err_cleanup: + brelse(journal->j_sb_buffer); kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); kfree(journal); @@ -2051,93 +2107,6 @@ recovery_error: } /** - * jbd2_journal_shrink_scan() - * - * Scan the checkpointed buffer on the checkpoint list and release the - * journal_head. - */ -static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - journal_t *journal = container_of(shrink, journal_t, j_shrinker); - unsigned long nr_to_scan = sc->nr_to_scan; - unsigned long nr_shrunk; - unsigned long count; - - count = percpu_counter_read_positive(&journal->j_jh_shrink_count); - trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); - - nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); - - count = percpu_counter_read_positive(&journal->j_jh_shrink_count); - trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); - - return nr_shrunk; -} - -/** - * jbd2_journal_shrink_count() - * - * Count the number of checkpoint buffers on the checkpoint list. - */ -static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - journal_t *journal = container_of(shrink, journal_t, j_shrinker); - unsigned long count; - - count = percpu_counter_read_positive(&journal->j_jh_shrink_count); - trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); - - return count; -} - -/** - * jbd2_journal_register_shrinker() - * @journal: Journal to act on. - * - * Init a percpu counter to record the checkpointed buffers on the checkpoint - * list and register a shrinker to release their journal_head. 
- */ -int jbd2_journal_register_shrinker(journal_t *journal) -{ - int err; - - journal->j_shrink_transaction = NULL; - - err = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL); - if (err) - return err; - - journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; - journal->j_shrinker.count_objects = jbd2_journal_shrink_count; - journal->j_shrinker.seeks = DEFAULT_SEEKS; - journal->j_shrinker.batch = journal->j_max_transaction_buffers; - - err = register_shrinker(&journal->j_shrinker); - if (err) { - percpu_counter_destroy(&journal->j_jh_shrink_count); - return err; - } - - return 0; -} -EXPORT_SYMBOL(jbd2_journal_register_shrinker); - -/** - * jbd2_journal_unregister_shrinker() - * @journal: Journal to act on. - * - * Unregister the checkpointed buffer shrinker and destroy the percpu counter. - */ -void jbd2_journal_unregister_shrinker(journal_t *journal) -{ - percpu_counter_destroy(&journal->j_jh_shrink_count); - unregister_shrinker(&journal->j_shrinker); -} -EXPORT_SYMBOL(jbd2_journal_unregister_shrinker); - -/** * jbd2_journal_destroy() - Release a journal_t structure. * @journal: Journal to act on. * @@ -2209,8 +2178,10 @@ int jbd2_journal_destroy(journal_t *journal) brelse(journal->j_sb_buffer); } - jbd2_journal_unregister_shrinker(journal); - + if (journal->j_shrinker.flags & SHRINKER_REGISTERED) { + percpu_counter_destroy(&journal->j_checkpoint_jh_count); + unregister_shrinker(&journal->j_shrinker); + } if (journal->j_proc_entry) jbd2_stats_proc_exit(journal); iput(journal->j_inode); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 7e0e62deab53..33166ec90a11 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -548,49 +548,6 @@ void kernfs_put(struct kernfs_node *kn) } EXPORT_SYMBOL_GPL(kernfs_put); -static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct kernfs_node *kn; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - /* Always perform fresh lookup for negatives */ - if (d_really_is_negative(dentry)) - goto out_bad_unlocked; - - kn = kernfs_dentry_node(dentry); - mutex_lock(&kernfs_mutex); - - /* The kernfs node has been deactivated */ - if (!kernfs_active(kn)) - goto out_bad; - - /* The kernfs node has been moved? */ - if (kernfs_dentry_node(dentry->d_parent) != kn->parent) - goto out_bad; - - /* The kernfs node has been renamed */ - if (strcmp(dentry->d_name.name, kn->name) != 0) - goto out_bad; - - /* The kernfs node has been moved to a different namespace */ - if (kn->parent && kernfs_ns_enabled(kn->parent) && - kernfs_info(dentry->d_sb)->ns != kn->ns) - goto out_bad; - - mutex_unlock(&kernfs_mutex); - return 1; -out_bad: - mutex_unlock(&kernfs_mutex); -out_bad_unlocked: - return 0; -} - -const struct dentry_operations kernfs_dops = { - .d_revalidate = kernfs_dop_revalidate, -}; - /** * kernfs_node_from_dentry - determine kernfs_node associated with a dentry * @dentry: the dentry in question @@ -1073,6 +1030,49 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, return ERR_PTR(rc); } +static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct kernfs_node *kn; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + /* Always perform fresh lookup for negatives */ + if (d_really_is_negative(dentry)) + goto out_bad_unlocked; + + kn = kernfs_dentry_node(dentry); + mutex_lock(&kernfs_mutex); + + /* The kernfs node has been deactivated */ + if (!kernfs_active(kn)) + goto out_bad; + + /* The kernfs node has been moved? 
*/ + if (kernfs_dentry_node(dentry->d_parent) != kn->parent) + goto out_bad; + + /* The kernfs node has been renamed */ + if (strcmp(dentry->d_name.name, kn->name) != 0) + goto out_bad; + + /* The kernfs node has been moved to a different namespace */ + if (kn->parent && kernfs_ns_enabled(kn->parent) && + kernfs_info(dentry->d_sb)->ns != kn->ns) + goto out_bad; + + mutex_unlock(&kernfs_mutex); + return 1; +out_bad: + mutex_unlock(&kernfs_mutex); +out_bad_unlocked: + return 0; +} + +const struct dentry_operations kernfs_dops = { + .d_revalidate = kernfs_dop_revalidate, +}; + static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 1a639e34847d..2de048f80eb8 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -766,6 +766,46 @@ static void __exit exit_nlm(void) module_init(init_nlm); module_exit(exit_nlm); +/** + * nlmsvc_dispatch - Process an NLM Request + * @rqstp: incoming request + * @statp: pointer to location of accept_stat field in RPC Reply buffer + * + * Return values: + * %0: Processing complete; do not send a Reply + * %1: Processing complete; send Reply in rqstp->rq_res + */ +static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp) +{ + const struct svc_procedure *procp = rqstp->rq_procinfo; + struct kvec *argv = rqstp->rq_arg.head; + struct kvec *resv = rqstp->rq_res.head; + + svcxdr_init_decode(rqstp); + if (!procp->pc_decode(rqstp, argv->iov_base)) + goto out_decode_err; + + *statp = procp->pc_func(rqstp); + if (*statp == rpc_drop_reply) + return 0; + if (*statp != rpc_success) + return 1; + + svcxdr_init_encode(rqstp); + if (!procp->pc_encode(rqstp, resv->iov_base + resv->iov_len)) + goto out_encode_err; + + return 1; + +out_decode_err: + *statp = rpc_garbage_args; + return 1; + +out_encode_err: + *statp = rpc_system_err; + return 1; +} + /* * Define NLM program and procedures */ @@ -775,6 +815,7 @@ static const struct svc_version nlmsvc_version1 = { .vs_nproc = 17, .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version1_count, + .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; static unsigned int nlmsvc_version3_count[24]; @@ -783,6 +824,7 @@ static const struct svc_version nlmsvc_version3 = { .vs_nproc = 24, .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version3_count, + .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #ifdef CONFIG_LOCKD_V4 @@ -792,6 +834,7 @@ static const struct svc_version nlmsvc_version4 = { .vs_nproc = 24, .vs_proc = nlmsvc_procedures4, .vs_count = nlmsvc_version4_count, + .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #endif diff --git a/fs/lockd/svcxdr.h b/fs/lockd/svcxdr.h new file mode 100644 index 000000000000..c69a0bb76c94 --- /dev/null +++ b/fs/lockd/svcxdr.h @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Encode/decode NLM basic data types + * + * Basic NLMv3 XDR data types are not defined in an IETF standards + * document. X/Open has a description of these data types that + * is useful. See Chapter 10 of "Protocols for Interworking: + * XNFS, Version 3W". + * + * Basic NLMv4 XDR data types are defined in Appendix II.1.4 of + * RFC 1813: "NFS Version 3 Protocol Specification". + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2020, Oracle and/or its affiliates. 
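+ *
+ * A sizing note for reviewers: everything below is laid out in 4-byte
+ * XDR units.  A length-prefixed opaque of len bytes occupies
+ * 4 + 4 * XDR_QUADLEN(len) bytes on the wire, where
+ * XDR_QUADLEN(len) = (len + 3) >> 2.  Worked example: a 5-byte cookie
+ * takes 4 + 4 * 2 = 12 bytes, and the trailing 3 pad bytes are what
+ * svcxdr_encode_owner()'s p[quadlen - 1] = 0 store pre-clears before
+ * the payload memcpy().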
+ */ + +#ifndef _LOCKD_SVCXDR_H_ +#define _LOCKD_SVCXDR_H_ + +static inline bool +svcxdr_decode_stats(struct xdr_stream *xdr, __be32 *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, XDR_UNIT); + if (!p) + return false; + *status = *p; + + return true; +} + +static inline bool +svcxdr_encode_stats(struct xdr_stream *xdr, __be32 status) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT); + if (!p) + return false; + *p = status; + + return true; +} + +static inline bool +svcxdr_decode_string(struct xdr_stream *xdr, char **data, unsigned int *data_len) +{ + __be32 *p; + u32 len; + + if (xdr_stream_decode_u32(xdr, &len) < 0) + return false; + if (len > NLM_MAXSTRLEN) + return false; + p = xdr_inline_decode(xdr, len); + if (!p) + return false; + *data_len = len; + *data = (char *)p; + + return true; +} + +/* + * NLM cookies are defined by specification to be a variable-length + * XDR opaque no longer than 1024 bytes. However, this implementation + * limits their length to 32 bytes, and treats zero-length cookies + * specially. + */ +static inline bool +svcxdr_decode_cookie(struct xdr_stream *xdr, struct nlm_cookie *cookie) +{ + __be32 *p; + u32 len; + + if (xdr_stream_decode_u32(xdr, &len) < 0) + return false; + if (len > NLM_MAXCOOKIELEN) + return false; + if (!len) + goto out_hpux; + + p = xdr_inline_decode(xdr, len); + if (!p) + return false; + cookie->len = len; + memcpy(cookie->data, p, len); + + return true; + + /* apparently HPUX can return empty cookies */ +out_hpux: + cookie->len = 4; + memset(cookie->data, 0, 4); + return true; +} + +static inline bool +svcxdr_encode_cookie(struct xdr_stream *xdr, const struct nlm_cookie *cookie) +{ + __be32 *p; + + if (xdr_stream_encode_u32(xdr, cookie->len) < 0) + return false; + p = xdr_reserve_space(xdr, cookie->len); + if (!p) + return false; + memcpy(p, cookie->data, cookie->len); + + return true; +} + +static inline bool +svcxdr_decode_owner(struct xdr_stream *xdr, struct xdr_netobj *obj) +{ + __be32 *p; + u32 len; + + if (xdr_stream_decode_u32(xdr, &len) < 0) + return false; + if (len > XDR_MAX_NETOBJ) + return false; + p = xdr_inline_decode(xdr, len); + if (!p) + return false; + obj->len = len; + obj->data = (u8 *)p; + + return true; +} + +static inline bool +svcxdr_encode_owner(struct xdr_stream *xdr, const struct xdr_netobj *obj) +{ + unsigned int quadlen = XDR_QUADLEN(obj->len); + __be32 *p; + + if (xdr_stream_encode_u32(xdr, obj->len) < 0) + return false; + p = xdr_reserve_space(xdr, obj->len); + if (!p) + return false; + p[quadlen - 1] = 0; /* XDR pad */ + memcpy(p, obj->data, obj->len); + + return true; +} + +#endif /* _LOCKD_SVCXDR_H_ */ diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 982629f7b120..9235e60b1769 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -19,7 +19,7 @@ #include <uapi/linux/nfs2.h> -#define NLMDBG_FACILITY NLMDBG_XDR +#include "svcxdr.h" static inline loff_t @@ -42,311 +42,323 @@ loff_t_to_s32(loff_t offset) } /* - * XDR functions for basic NLM types + * NLM file handles are defined by specification to be a variable-length + * XDR opaque no longer than 1024 bytes. However, this implementation + * constrains their length to exactly the length of an NFSv2 file + * handle. 
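+ *
+ * All of the svcxdr_decode_* helpers in svcxdr.h above follow the
+ * same shape: read a 4-byte length, bounds-check it, then consume the
+ * padded payload.  A hedged restatement of that shape (not itself
+ * part of this patch; it reuses the xdr_stream helpers seen above
+ * with a caller-supplied bound):
+ *
+ *	static bool decode_bounded_opaque(struct xdr_stream *xdr,
+ *					  void *buf, u32 maxlen,
+ *					  u32 *lenp)
+ *	{
+ *		__be32 *p;
+ *		u32 len;
+ *
+ *		if (xdr_stream_decode_u32(xdr, &len) < 0)
+ *			return false;	// ran off the receive buffer
+ *		if (len > maxlen)
+ *			return false;	// reject oversized items early
+ *		p = xdr_inline_decode(xdr, len);  // advances past padding
+ *		if (!p)
+ *			return false;
+ *		memcpy(buf, p, len);
+ *		*lenp = len;
+ *		return true;
+ *	}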
*/ -static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c) +static bool +svcxdr_decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) { - unsigned int len; - - len = ntohl(*p++); - - if(len==0) - { - c->len=4; - memset(c->data, 0, 4); /* hockeypux brain damage */ - } - else if(len<=NLM_MAXCOOKIELEN) - { - c->len=len; - memcpy(c->data, p, len); - p+=XDR_QUADLEN(len); - } - else - { - dprintk("lockd: bad cookie size %d (only cookies under " - "%d bytes are supported.)\n", - len, NLM_MAXCOOKIELEN); - return NULL; - } - return p; -} - -static inline __be32 * -nlm_encode_cookie(__be32 *p, struct nlm_cookie *c) -{ - *p++ = htonl(c->len); - memcpy(p, c->data, c->len); - p+=XDR_QUADLEN(c->len); - return p; -} - -static __be32 * -nlm_decode_fh(__be32 *p, struct nfs_fh *f) -{ - unsigned int len; - - if ((len = ntohl(*p++)) != NFS2_FHSIZE) { - dprintk("lockd: bad fhandle size %d (should be %d)\n", - len, NFS2_FHSIZE); - return NULL; - } - f->size = NFS2_FHSIZE; - memset(f->data, 0, sizeof(f->data)); - memcpy(f->data, p, NFS2_FHSIZE); - return p + XDR_QUADLEN(NFS2_FHSIZE); -} - -/* - * Encode and decode owner handle - */ -static inline __be32 * -nlm_decode_oh(__be32 *p, struct xdr_netobj *oh) -{ - return xdr_decode_netobj(p, oh); -} - -static inline __be32 * -nlm_encode_oh(__be32 *p, struct xdr_netobj *oh) -{ - return xdr_encode_netobj(p, oh); + __be32 *p; + u32 len; + + if (xdr_stream_decode_u32(xdr, &len) < 0) + return false; + if (len != NFS2_FHSIZE) + return false; + + p = xdr_inline_decode(xdr, len); + if (!p) + return false; + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, len); + memset(fh->data + NFS2_FHSIZE, 0, sizeof(fh->data) - NFS2_FHSIZE); + + return true; } -static __be32 * -nlm_decode_lock(__be32 *p, struct nlm_lock *lock) +static bool +svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock) { - struct file_lock *fl = &lock->fl; - s32 start, len, end; - - if (!(p = xdr_decode_string_inplace(p, &lock->caller, - &lock->len, - NLM_MAXSTRLEN)) - || !(p = nlm_decode_fh(p, &lock->fh)) - || !(p = nlm_decode_oh(p, &lock->oh))) - return NULL; - lock->svid = ntohl(*p++); + struct file_lock *fl = &lock->fl; + s32 start, len, end; + + if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) + return false; + if (!svcxdr_decode_fhandle(xdr, &lock->fh)) + return false; + if (!svcxdr_decode_owner(xdr, &lock->oh)) + return false; + if (xdr_stream_decode_u32(xdr, &lock->svid) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &start) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &len) < 0) + return false; locks_init_lock(fl); fl->fl_flags = FL_POSIX; - fl->fl_type = F_RDLCK; /* as good as anything else */ - start = ntohl(*p++); - len = ntohl(*p++); + fl->fl_type = F_RDLCK; end = start + len - 1; - fl->fl_start = s32_to_loff_t(start); - if (len == 0 || end < 0) fl->fl_end = OFFSET_MAX; else fl->fl_end = s32_to_loff_t(end); - return p; + + return true; } -/* - * Encode result of a TEST/TEST_MSG call - */ -static __be32 * -nlm_encode_testres(__be32 *p, struct nlm_res *resp) +static bool +svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock) { - s32 start, len; - - if (!(p = nlm_encode_cookie(p, &resp->cookie))) - return NULL; - *p++ = resp->status; - - if (resp->status == nlm_lck_denied) { - struct file_lock *fl = &resp->lock.fl; - - *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one; - *p++ = htonl(resp->lock.svid); - - /* Encode owner handle. 
*/ - if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) - return NULL; + const struct file_lock *fl = &lock->fl; + s32 start, len; + + /* exclusive */ + if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0) + return false; + if (xdr_stream_encode_u32(xdr, lock->svid) < 0) + return false; + if (!svcxdr_encode_owner(xdr, &lock->oh)) + return false; + start = loff_t_to_s32(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + len = 0; + else + len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); + if (xdr_stream_encode_u32(xdr, start) < 0) + return false; + if (xdr_stream_encode_u32(xdr, len) < 0) + return false; - start = loff_t_to_s32(fl->fl_start); - if (fl->fl_end == OFFSET_MAX) - len = 0; - else - len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); + return true; +} - *p++ = htonl(start); - *p++ = htonl(len); +static bool +svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp) +{ + if (!svcxdr_encode_stats(xdr, resp->status)) + return false; + switch (resp->status) { + case nlm_lck_denied: + if (!svcxdr_encode_holder(xdr, &resp->lock)) + return false; } - return p; + return true; } /* - * First, the server side XDR functions + * Decode Call arguments */ + +int +nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p) +{ + return 1; +} + int nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; + u32 exclusive; - if (!(p = nlm_decode_cookie(p, &argp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) return 0; - - exclusive = ntohl(*p++); - if (!(p = nlm_decode_lock(p, &argp->lock))) + if (xdr_stream_decode_bool(xdr, &exclusive) < 0) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; if (exclusive) argp->lock.fl.fl_type = F_WRLCK; - return xdr_argsize_check(rqstp, p); -} - -int -nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p) -{ - struct nlm_res *resp = rqstp->rq_resp; - - if (!(p = nlm_encode_testres(p, resp))) - return 0; - return xdr_ressize_check(rqstp, p); + return 1; } int nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; + u32 exclusive; - if (!(p = nlm_decode_cookie(p, &argp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if (xdr_stream_decode_bool(xdr, &argp->block) < 0) return 0; - argp->block = ntohl(*p++); - exclusive = ntohl(*p++); - if (!(p = nlm_decode_lock(p, &argp->lock))) + if (xdr_stream_decode_bool(xdr, &exclusive) < 0) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; if (exclusive) argp->lock.fl.fl_type = F_WRLCK; - argp->reclaim = ntohl(*p++); - argp->state = ntohl(*p++); + if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->state) < 0) + return 0; argp->monitor = 1; /* monitor client by default */ - return xdr_argsize_check(rqstp, p); + return 1; } int nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; + u32 exclusive; - if (!(p = nlm_decode_cookie(p, &argp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if (xdr_stream_decode_bool(xdr, &argp->block) < 0) return 0; - argp->block = ntohl(*p++); - exclusive = ntohl(*p++); - if (!(p = nlm_decode_lock(p, &argp->lock))) + if (xdr_stream_decode_bool(xdr, &exclusive) < 0) + return 0; + if (!svcxdr_decode_lock(xdr, 
&argp->lock)) return 0; if (exclusive) argp->lock.fl.fl_type = F_WRLCK; - return xdr_argsize_check(rqstp, p); + + return 1; } int nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - if (!(p = nlm_decode_cookie(p, &argp->cookie)) - || !(p = nlm_decode_lock(p, &argp->lock))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; argp->lock.fl.fl_type = F_UNLCK; - return xdr_argsize_check(rqstp, p); + + return 1; } int -nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_lock *lock = &argp->lock; - - memset(lock, 0, sizeof(*lock)); - locks_init_lock(&lock->fl); - lock->svid = ~(u32) 0; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nlm_res *resp = rqstp->rq_argp; - if (!(p = nlm_decode_cookie(p, &argp->cookie)) - || !(p = xdr_decode_string_inplace(p, &lock->caller, - &lock->len, NLM_MAXSTRLEN)) - || !(p = nlm_decode_fh(p, &lock->fh)) - || !(p = nlm_decode_oh(p, &lock->oh))) + if (!svcxdr_decode_cookie(xdr, &resp->cookie)) + return 0; + if (!svcxdr_decode_stats(xdr, &resp->status)) return 0; - argp->fsm_mode = ntohl(*p++); - argp->fsm_access = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + + return 1; } int -nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_res *resp = rqstp->rq_resp; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nlm_reboot *argp = rqstp->rq_argp; + u32 len; - if (!(p = nlm_encode_cookie(p, &resp->cookie))) + if (xdr_stream_decode_u32(xdr, &len) < 0) + return 0; + if (len > SM_MAXSTRLEN) + return 0; + p = xdr_inline_decode(xdr, len); + if (!p) + return 0; + argp->len = len; + argp->mon = (char *)p; + if (xdr_stream_decode_u32(xdr, &argp->state) < 0) + return 0; + p = xdr_inline_decode(xdr, SM_PRIV_SIZE); + if (!p) return 0; - *p++ = resp->status; - *p++ = xdr_zero; /* sequence argument */ - return xdr_ressize_check(rqstp, p); + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + + return 1; } int -nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_res *resp = rqstp->rq_resp; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nlm_args *argp = rqstp->rq_argp; + struct nlm_lock *lock = &argp->lock; - if (!(p = nlm_encode_cookie(p, &resp->cookie))) + memset(lock, 0, sizeof(*lock)); + locks_init_lock(&lock->fl); + lock->svid = ~(u32)0; + + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) + return 0; + if (!svcxdr_decode_fhandle(xdr, &lock->fh)) + return 0; + if (!svcxdr_decode_owner(xdr, &lock->oh)) + return 0; + /* XXX: Range checks are missing in the original code */ + if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0) return 0; - *p++ = resp->status; - return xdr_ressize_check(rqstp, p); + + return 1; } int nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; struct nlm_lock *lock = &argp->lock; - if (!(p = xdr_decode_string_inplace(p, &lock->caller, - &lock->len, NLM_MAXSTRLEN))) + if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) + return 0; + if 
(xdr_stream_decode_u32(xdr, &argp->state) < 0) return 0; - argp->state = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + + return 1; } + +/* + * Encode Reply results + */ + int -nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_reboot *argp = rqstp->rq_argp; - - if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) - return 0; - argp->state = ntohl(*p++); - memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); - p += XDR_QUADLEN(SM_PRIV_SIZE); - return xdr_argsize_check(rqstp, p); + return 1; } int -nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_res *resp = rqstp->rq_argp; + struct xdr_stream *xdr = &rqstp->rq_res_stream; + struct nlm_res *resp = rqstp->rq_resp; - if (!(p = nlm_decode_cookie(p, &resp->cookie))) - return 0; - resp->status = *p++; - return xdr_argsize_check(rqstp, p); + return svcxdr_encode_cookie(xdr, &resp->cookie) && + svcxdr_encode_testrply(xdr, resp); } int -nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p) { - return xdr_argsize_check(rqstp, p); + struct xdr_stream *xdr = &rqstp->rq_res_stream; + struct nlm_res *resp = rqstp->rq_resp; + + return svcxdr_encode_cookie(xdr, &resp->cookie) && + svcxdr_encode_stats(xdr, resp->status); } int -nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p) { - return xdr_ressize_check(rqstp, p); + struct xdr_stream *xdr = &rqstp->rq_res_stream; + struct nlm_res *resp = rqstp->rq_resp; + + if (!svcxdr_encode_cookie(xdr, &resp->cookie)) + return 0; + if (!svcxdr_encode_stats(xdr, resp->status)) + return 0; + /* sequence */ + if (xdr_stream_encode_u32(xdr, 0) < 0) + return 0; + + return 1; } diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 5fa9f48a9dba..98e957e4566c 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -18,7 +18,7 @@ #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> -#define NLMDBG_FACILITY NLMDBG_XDR +#include "svcxdr.h" static inline loff_t s64_to_loff_t(__s64 offset) @@ -41,309 +41,322 @@ loff_t_to_s64(loff_t offset) } /* - * XDR functions for basic NLM types + * NLM file handles are defined by specification to be a variable-length + * XDR opaque no longer than 1024 bytes. However, this implementation + * limits their length to the size of an NFSv3 file handle. 
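+ *
+ * A worked example of the byte-range mapping done by
+ * svcxdr_decode_lock() below, where the wire carries (offset, length)
+ * and the kernel keeps [fl_start, fl_end] with end = start + len - 1:
+ *
+ *	offset = 100, length = 10  =>  fl_start = 100, fl_end = 109
+ *	offset = 100, length = 0   =>  fl_start = 100, fl_end = OFFSET_MAX
+ *
+ * A zero length (or an end that goes negative after the signed
+ * conversion) means "lock through end of file".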
*/ -static __be32 * -nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c) +static bool +svcxdr_decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) { - unsigned int len; - - len = ntohl(*p++); - - if(len==0) - { - c->len=4; - memset(c->data, 0, 4); /* hockeypux brain damage */ - } - else if(len<=NLM_MAXCOOKIELEN) - { - c->len=len; - memcpy(c->data, p, len); - p+=XDR_QUADLEN(len); - } - else - { - dprintk("lockd: bad cookie size %d (only cookies under " - "%d bytes are supported.)\n", - len, NLM_MAXCOOKIELEN); - return NULL; - } - return p; -} - -static __be32 * -nlm4_encode_cookie(__be32 *p, struct nlm_cookie *c) -{ - *p++ = htonl(c->len); - memcpy(p, c->data, c->len); - p+=XDR_QUADLEN(c->len); - return p; -} - -static __be32 * -nlm4_decode_fh(__be32 *p, struct nfs_fh *f) -{ - memset(f->data, 0, sizeof(f->data)); - f->size = ntohl(*p++); - if (f->size > NFS_MAXFHSIZE) { - dprintk("lockd: bad fhandle size %d (should be <=%d)\n", - f->size, NFS_MAXFHSIZE); - return NULL; - } - memcpy(f->data, p, f->size); - return p + XDR_QUADLEN(f->size); -} - -/* - * Encode and decode owner handle - */ -static __be32 * -nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh) -{ - return xdr_decode_netobj(p, oh); + __be32 *p; + u32 len; + + if (xdr_stream_decode_u32(xdr, &len) < 0) + return false; + if (len > NFS_MAXFHSIZE) + return false; + + p = xdr_inline_decode(xdr, len); + if (!p) + return false; + fh->size = len; + memcpy(fh->data, p, len); + memset(fh->data + len, 0, sizeof(fh->data) - len); + + return true; } -static __be32 * -nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) +static bool +svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock) { - struct file_lock *fl = &lock->fl; - __u64 len, start; - __s64 end; - - if (!(p = xdr_decode_string_inplace(p, &lock->caller, - &lock->len, NLM_MAXSTRLEN)) - || !(p = nlm4_decode_fh(p, &lock->fh)) - || !(p = nlm4_decode_oh(p, &lock->oh))) - return NULL; - lock->svid = ntohl(*p++); + struct file_lock *fl = &lock->fl; + u64 len, start; + s64 end; + + if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) + return false; + if (!svcxdr_decode_fhandle(xdr, &lock->fh)) + return false; + if (!svcxdr_decode_owner(xdr, &lock->oh)) + return false; + if (xdr_stream_decode_u32(xdr, &lock->svid) < 0) + return false; + if (xdr_stream_decode_u64(xdr, &start) < 0) + return false; + if (xdr_stream_decode_u64(xdr, &len) < 0) + return false; locks_init_lock(fl); fl->fl_flags = FL_POSIX; - fl->fl_type = F_RDLCK; /* as good as anything else */ - p = xdr_decode_hyper(p, &start); - p = xdr_decode_hyper(p, &len); + fl->fl_type = F_RDLCK; end = start + len - 1; - fl->fl_start = s64_to_loff_t(start); - if (len == 0 || end < 0) fl->fl_end = OFFSET_MAX; else fl->fl_end = s64_to_loff_t(end); - return p; + + return true; } -/* - * Encode result of a TEST/TEST_MSG call - */ -static __be32 * -nlm4_encode_testres(__be32 *p, struct nlm_res *resp) +static bool +svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock) +{ + const struct file_lock *fl = &lock->fl; + s64 start, len; + + /* exclusive */ + if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0) + return false; + if (xdr_stream_encode_u32(xdr, lock->svid) < 0) + return false; + if (!svcxdr_encode_owner(xdr, &lock->oh)) + return false; + start = loff_t_to_s64(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + len = 0; + else + len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); + if (xdr_stream_encode_u64(xdr, start) < 0) + return false; + if (xdr_stream_encode_u64(xdr, len) < 0) + return false; + + 
return true; +} + +static bool +svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp) { - s64 start, len; - - dprintk("xdr: before encode_testres (p %p resp %p)\n", p, resp); - if (!(p = nlm4_encode_cookie(p, &resp->cookie))) - return NULL; - *p++ = resp->status; - - if (resp->status == nlm_lck_denied) { - struct file_lock *fl = &resp->lock.fl; - - *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one; - *p++ = htonl(resp->lock.svid); - - /* Encode owner handle. */ - if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) - return NULL; - - start = loff_t_to_s64(fl->fl_start); - if (fl->fl_end == OFFSET_MAX) - len = 0; - else - len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); - - p = xdr_encode_hyper(p, start); - p = xdr_encode_hyper(p, len); - dprintk("xdr: encode_testres (status %u pid %d type %d start %Ld end %Ld)\n", - resp->status, (int)resp->lock.svid, fl->fl_type, - (long long)fl->fl_start, (long long)fl->fl_end); + if (!svcxdr_encode_stats(xdr, resp->status)) + return false; + switch (resp->status) { + case nlm_lck_denied: + if (!svcxdr_encode_holder(xdr, &resp->lock)) + return false; } - dprintk("xdr: after encode_testres (p %p resp %p)\n", p, resp); - return p; + return true; } /* - * First, the server side XDR functions + * Decode Call arguments */ + +int +nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p) +{ + return 1; +} + int nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; + u32 exclusive; - if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) return 0; - - exclusive = ntohl(*p++); - if (!(p = nlm4_decode_lock(p, &argp->lock))) + if (xdr_stream_decode_bool(xdr, &exclusive) < 0) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; if (exclusive) argp->lock.fl.fl_type = F_WRLCK; - return xdr_argsize_check(rqstp, p); -} - -int -nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p) -{ - struct nlm_res *resp = rqstp->rq_resp; - - if (!(p = nlm4_encode_testres(p, resp))) - return 0; - return xdr_ressize_check(rqstp, p); + return 1; } int nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; + u32 exclusive; - if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if (xdr_stream_decode_bool(xdr, &argp->block) < 0) return 0; - argp->block = ntohl(*p++); - exclusive = ntohl(*p++); - if (!(p = nlm4_decode_lock(p, &argp->lock))) + if (xdr_stream_decode_bool(xdr, &exclusive) < 0) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; if (exclusive) argp->lock.fl.fl_type = F_WRLCK; - argp->reclaim = ntohl(*p++); - argp->state = ntohl(*p++); + if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->state) < 0) + return 0; argp->monitor = 1; /* monitor client by default */ - return xdr_argsize_check(rqstp, p); + return 1; } int nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; + u32 exclusive; - if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if (xdr_stream_decode_bool(xdr, &argp->block) < 0) return 0; - argp->block = ntohl(*p++); - exclusive = ntohl(*p++); - if (!(p = 
nlm4_decode_lock(p, &argp->lock))) + if (xdr_stream_decode_bool(xdr, &exclusive) < 0) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; if (exclusive) argp->lock.fl.fl_type = F_WRLCK; - return xdr_argsize_check(rqstp, p); + return 1; } int nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - if (!(p = nlm4_decode_cookie(p, &argp->cookie)) - || !(p = nlm4_decode_lock(p, &argp->lock))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; argp->lock.fl.fl_type = F_UNLCK; - return xdr_argsize_check(rqstp, p); + + return 1; } int -nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_lock *lock = &argp->lock; - - memset(lock, 0, sizeof(*lock)); - locks_init_lock(&lock->fl); - lock->svid = ~(u32) 0; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nlm_res *resp = rqstp->rq_argp; - if (!(p = nlm4_decode_cookie(p, &argp->cookie)) - || !(p = xdr_decode_string_inplace(p, &lock->caller, - &lock->len, NLM_MAXSTRLEN)) - || !(p = nlm4_decode_fh(p, &lock->fh)) - || !(p = nlm4_decode_oh(p, &lock->oh))) + if (!svcxdr_decode_cookie(xdr, &resp->cookie)) + return 0; + if (!svcxdr_decode_stats(xdr, &resp->status)) return 0; - argp->fsm_mode = ntohl(*p++); - argp->fsm_access = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + + return 1; } int -nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_res *resp = rqstp->rq_resp; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nlm_reboot *argp = rqstp->rq_argp; + u32 len; - if (!(p = nlm4_encode_cookie(p, &resp->cookie))) + if (xdr_stream_decode_u32(xdr, &len) < 0) return 0; - *p++ = resp->status; - *p++ = xdr_zero; /* sequence argument */ - return xdr_ressize_check(rqstp, p); + if (len > SM_MAXSTRLEN) + return 0; + p = xdr_inline_decode(xdr, len); + if (!p) + return 0; + argp->len = len; + argp->mon = (char *)p; + if (xdr_stream_decode_u32(xdr, &argp->state) < 0) + return 0; + p = xdr_inline_decode(xdr, SM_PRIV_SIZE); + if (!p) + return 0; + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + + return 1; } int -nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_res *resp = rqstp->rq_resp; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nlm_args *argp = rqstp->rq_argp; + struct nlm_lock *lock = &argp->lock; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(&lock->fl); + lock->svid = ~(u32)0; - if (!(p = nlm4_encode_cookie(p, &resp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) return 0; - *p++ = resp->status; - return xdr_ressize_check(rqstp, p); + if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) + return 0; + if (!svcxdr_decode_fhandle(xdr, &lock->fh)) + return 0; + if (!svcxdr_decode_owner(xdr, &lock->oh)) + return 0; + /* XXX: Range checks are missing in the original code */ + if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0) + return 0; + + return 1; } int nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; struct nlm_lock *lock = &argp->lock; - if (!(p = xdr_decode_string_inplace(p, 
&lock->caller, - &lock->len, NLM_MAXSTRLEN))) + if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->state) < 0) return 0; - argp->state = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + + return 1; } + +/* + * Encode Reply results + */ + int -nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_reboot *argp = rqstp->rq_argp; - - if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) - return 0; - argp->state = ntohl(*p++); - memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); - p += XDR_QUADLEN(SM_PRIV_SIZE); - return xdr_argsize_check(rqstp, p); + return 1; } int -nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_res *resp = rqstp->rq_argp; + struct xdr_stream *xdr = &rqstp->rq_res_stream; + struct nlm_res *resp = rqstp->rq_resp; - if (!(p = nlm4_decode_cookie(p, &resp->cookie))) - return 0; - resp->status = *p++; - return xdr_argsize_check(rqstp, p); + return svcxdr_encode_cookie(xdr, &resp->cookie) && + svcxdr_encode_testrply(xdr, resp); } int -nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p) { - return xdr_argsize_check(rqstp, p); + struct xdr_stream *xdr = &rqstp->rq_res_stream; + struct nlm_res *resp = rqstp->rq_resp; + + return svcxdr_encode_cookie(xdr, &resp->cookie) && + svcxdr_encode_stats(xdr, resp->status); } int -nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p) { - return xdr_ressize_check(rqstp, p); + struct xdr_stream *xdr = &rqstp->rq_res_stream; + struct nlm_res *resp = rqstp->rq_resp; + + if (!svcxdr_encode_cookie(xdr, &resp->cookie)) + return 0; + if (!svcxdr_encode_stats(xdr, resp->status)) + return 0; + /* sequence */ + if (xdr_stream_encode_u32(xdr, 0) < 0) + return 0; + + return 1; } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index e6ec6f09ac6e..11118398f495 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -75,6 +75,13 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } +static void nfs_mark_return_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + static bool nfs4_is_valid_delegation(const struct nfs_delegation *delegation, fmode_t flags) @@ -293,6 +300,7 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi) goto out; spin_lock(&delegation->lock); if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); /* Refcount matched in nfs_end_delegation_return() */ ret = nfs_get_delegation(delegation); } @@ -314,16 +322,17 @@ nfs_start_delegation_return(struct nfs_inode *nfsi) return delegation; } -static void -nfs_abort_delegation_return(struct nfs_delegation *delegation, - struct nfs_client *clp) +static void nfs_abort_delegation_return(struct nfs_delegation *delegation, + struct nfs_client *clp, int err) { spin_lock(&delegation->lock); clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + if (err == -EAGAIN) { + set_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN_DELAYED, 
&clp->cl_state); + } spin_unlock(&delegation->lock); - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); } static struct nfs_delegation * @@ -521,11 +530,18 @@ out: static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + unsigned int mode = O_WRONLY | O_RDWR; int err = 0; if (delegation == NULL) return 0; - do { + + if (!issync) + mode |= O_NONBLOCK; + /* Recall of any remaining application leases */ + err = break_lease(inode, mode); + + while (err == 0) { if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) break; err = nfs_delegation_claim_opens(inode, &delegation->stateid, @@ -536,10 +552,10 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation * Guard against state recovery */ err = nfs4_wait_clnt_recover(clp); - } while (err == 0); + } if (err) { - nfs_abort_delegation_return(delegation, clp); + nfs_abort_delegation_return(delegation, clp, err); goto out; } @@ -568,6 +584,7 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) if (ret) clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) || + test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) || test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) ret = false; @@ -647,6 +664,38 @@ out: return err; } +static bool nfs_server_clear_delayed_delegations(struct nfs_server *server) +{ + struct nfs_delegation *d; + bool ret = false; + + list_for_each_entry_rcu (d, &server->delegations, super_list) { + if (!test_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags)) + continue; + nfs_mark_return_delegation(server, d); + clear_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags); + ret = true; + } + return ret; +} + +static bool nfs_client_clear_delayed_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + bool ret = false; + + if (!test_and_clear_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state)) + goto out; + rcu_read_lock(); + list_for_each_entry_rcu (server, &clp->cl_superblocks, client_link) { + if (nfs_server_clear_delayed_delegations(server)) + ret = true; + } + rcu_read_unlock(); +out: + return ret; +} + /** * nfs_client_return_marked_delegations - return previously marked delegations * @clp: nfs_client to process @@ -659,8 +708,14 @@ out: */ int nfs_client_return_marked_delegations(struct nfs_client *clp) { - return nfs_client_for_each_server(clp, - nfs_server_return_marked_delegations, NULL); + int err = nfs_client_for_each_server( + clp, nfs_server_return_marked_delegations, NULL); + if (err) + return err; + /* If a return was delayed, sleep to prevent hard looping */ + if (nfs_client_clear_delayed_delegations(clp)) + ssleep(1); + return 0; } /** @@ -698,13 +753,14 @@ int nfs4_inode_return_delegation(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - int err = 0; - nfs_wb_all(inode); delegation = nfs_start_delegation_return(nfsi); + /* Synchronous recall of any application leases */ + break_lease(inode, O_WRONLY | O_RDWR); + nfs_wb_all(inode); if (delegation != NULL) - err = nfs_end_delegation_return(inode, delegation, 1); - return err; + return nfs_end_delegation_return(inode, delegation, 1); + return 0; } /** @@ -775,13 +831,6 @@ static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); } -static void nfs_mark_return_delegation(struct nfs_server *server, - struct 
nfs_delegation *delegation) -{ - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); -} - static bool nfs_server_mark_return_all_delegations(struct nfs_server *server) { struct nfs_delegation *delegation; @@ -1010,6 +1059,9 @@ int nfs_async_inode_return_delegation(struct inode *inode, nfs_mark_return_delegation(server, delegation); rcu_read_unlock(); + /* If there are any application leases or delegations, recall them */ + break_lease(inode, O_WRONLY | O_RDWR | O_NONBLOCK); + nfs_delegation_run_state_manager(clp); return 0; out_enoent: diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index c19b4fd20781..1c378992b7c0 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -36,6 +36,7 @@ enum { NFS_DELEGATION_REVOKED, NFS_DELEGATION_TEST_EXPIRED, NFS_DELEGATION_INODE_FREEING, + NFS_DELEGATION_RETURN_DELAYED, }; int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 2d30a4da49fa..2e894fec036b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -700,8 +700,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; struct nfs_commit_info cinfo; - bool request_commit = false; struct nfs_page *req = nfs_list_entry(hdr->pages.next); + int flags = NFS_ODIRECT_DONE; nfs_init_cinfo_from_dreq(&cinfo, dreq); @@ -713,15 +713,9 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) nfs_direct_count_bytes(dreq, hdr); if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) { - switch (dreq->flags) { - case 0: + if (!dreq->flags) dreq->flags = NFS_ODIRECT_DO_COMMIT; - request_commit = true; - break; - case NFS_ODIRECT_RESCHED_WRITES: - case NFS_ODIRECT_DO_COMMIT: - request_commit = true; - } + flags = dreq->flags; } spin_unlock(&dreq->lock); @@ -729,12 +723,15 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); - if (request_commit) { + if (flags == NFS_ODIRECT_DO_COMMIT) { kref_get(&req->wb_kref); memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo, hdr->ds_commit_idx); + } else if (flags == NFS_ODIRECT_RESCHED_WRITES) { + kref_get(&req->wb_kref); + nfs_mark_request_commit(req, NULL, &cinfo, 0); } nfs_unlock_and_release_request(req); } diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index c4c021c6ebbd..d743629e05e1 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -385,12 +385,15 @@ static void nfs_readpage_from_fscache_complete(struct page *page, "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n", page, context, error); - /* if the read completes with an error, we just unlock the page and let - * the VM reissue the readpage */ - if (!error) { + /* + * If the read completes with an error, mark the page with PG_checked, + * unlock the page, and let the VM reissue the readpage. 
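+ *
+ * In effect PG_checked becomes a one-shot "the cache already failed
+ * for this page" marker.  Sketched as a pair:
+ *
+ *	completion:	error ? SetPageChecked(page)
+ *			      : SetPageUptodate(page);
+ *			unlock_page(page);
+ *
+ *	retry path:	if (PageChecked(page)) {
+ *				ClearPageChecked(page);
+ *				return 1;	// go to the server
+ *			}
+ *
+ * which is exactly what the two hunks in this file implement.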
+ */ + if (!error) SetPageUptodate(page); - unlock_page(page); - } + else + SetPageChecked(page); + unlock_page(page); } /* @@ -405,6 +408,11 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", nfs_i_fscache(inode), page, page->index, page->flags, inode); + if (PageChecked(page)) { + ClearPageChecked(page); + return 1; + } + ret = fscache_read_or_alloc_page(nfs_i_fscache(inode), page, nfs_readpage_from_fscache_complete, diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index aaeeb4659bff..59355c106ece 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -67,7 +67,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i int nfs_get_root(struct super_block *s, struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); - struct nfs_server *server = NFS_SB(s); + struct nfs_server *server = NFS_SB(s), *clone_server; struct nfs_fsinfo fsinfo; struct dentry *root; struct inode *inode; @@ -127,7 +127,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) } spin_unlock(&root->d_lock); fc->root = root; - if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) + if (server->caps & NFS_CAP_SECURITY_LABEL) kflags |= SECURITY_LSM_NATIVE_LABELS; if (ctx->clone_data.sb) { if (d_inode(fc->root)->i_fop != &nfs_dir_operations) { @@ -137,15 +137,19 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) /* clone lsm security options from the parent to the new sb */ error = security_sb_clone_mnt_opts(ctx->clone_data.sb, s, kflags, &kflags_out); + if (error) + goto error_splat_root; + clone_server = NFS_SB(ctx->clone_data.sb); + server->has_sec_mnt_opts = clone_server->has_sec_mnt_opts; } else { error = security_sb_set_mnt_opts(s, fc->security, kflags, &kflags_out); } if (error) goto error_splat_root; - if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && + if (server->caps & NFS_CAP_SECURITY_LABEL && !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) - NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; + server->caps &= ~NFS_CAP_SECURITY_LABEL; nfs_setsecurity(inode, fsinfo.fattr, fsinfo.fattr->label); error = 0; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 529c4099f482..853213b3a209 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1101,6 +1101,7 @@ EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) { filp->private_data = get_nfs_open_context(ctx); + set_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); if (list_empty(&ctx->list)) nfs_inode_attach_open_context(ctx); } @@ -1120,6 +1121,8 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct continue; if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) continue; + if (!test_bit(NFS_CONTEXT_FILE_OPEN, &pos->flags)) + continue; ctx = get_nfs_open_context(pos); if (ctx) break; @@ -1135,6 +1138,7 @@ void nfs_file_clear_open_context(struct file *filp) if (ctx) { struct inode *inode = d_inode(ctx->dentry); + clear_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); /* * We fatal error on write before. Try to writeback * every page again. 
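(Aside on the open-context hunks above: an nfs_open_context can outlive
the struct file that created it, so nfs_find_open_context() now skips
any context whose NFS_CONTEXT_FILE_OPEN bit has been cleared by
nfs_file_clear_open_context().  A minimal standalone sketch of that
flag-gated lookup, returning the first context still backed by an open
file; all names here are illustrative only:

	#include <stddef.h>

	enum { CTX_FILE_OPEN = 1 << 0 };

	struct ctx {
		unsigned int	flags;
		struct ctx	*next;
	};

	static struct ctx *find_open_ctx(struct ctx *head)
	{
		struct ctx *c;

		for (c = head; c; c = c->next)
			if (c->flags & CTX_FILE_OPEN)
				return c;
		return NULL;
	}

Open sets the bit, release clears it, and lookups ignore cleared
entries even though they may still be linked on the inode's list.)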
@@ -2055,35 +2059,33 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); + attr_changed = true; dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); } else if (!have_delegation) nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; inode_set_iversion_raw(inode, fattr->change_attr); - attr_changed = true; } } else { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CHANGE; - cache_revalidated = false; + if (!have_delegation || + (nfsi->cache_validity & NFS_INO_INVALID_CHANGE) != 0) + cache_revalidated = false; } - if (fattr->valid & NFS_ATTR_FATTR_MTIME) { + if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; - } else if (fattr_supported & NFS_ATTR_FATTR_MTIME) { + else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MTIME; - cache_revalidated = false; - } - if (fattr->valid & NFS_ATTR_FATTR_CTIME) { + if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; - } else if (fattr_supported & NFS_ATTR_FATTR_CTIME) { + else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CTIME; - cache_revalidated = false; - } /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -2096,7 +2098,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) i_size_write(inode, new_isize); if (!have_writers) invalid |= NFS_INO_INVALID_DATA; - attr_changed = true; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -2111,19 +2112,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) fattr->du.nfs3.used = 0; fattr->valid |= NFS_ATTR_FATTR_SPACE_USED; } - } else { + } else nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_SIZE; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; - else if (fattr_supported & NFS_ATTR_FATTR_ATIME) { + else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATIME; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { @@ -2132,71 +2129,55 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_mode = newmode; invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; - attr_changed = true; } - } else if (fattr_supported & NFS_ATTR_FATTR_MODE) { + } else if (fattr_supported & NFS_ATTR_FATTR_MODE) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MODE; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; - attr_changed = true; } - } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) { + } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_OTHER; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; - attr_changed = true; } - } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) { + } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) nfsi->cache_validity |= save_cache_validity & 
NFS_INO_INVALID_OTHER; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; set_nlink(inode, fattr->nlink); - attr_changed = true; } - } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) { + } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_NLINK; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) { + } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_BLOCKS; - cache_revalidated = false; - } - if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) { + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; - } else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) { + else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_BLOCKS; - cache_revalidated = false; - } /* Update attrtimeo value if we're out of the unstable period */ if (attr_changed) { diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 5c4e23abc345..2299446b3b89 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -385,7 +385,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, break; case NFS3_CREATE_UNCHECKED: - goto out; + goto out_release_acls; } nfs_fattr_init(data->res.dir_attr); nfs_fattr_init(data->res.fattr); @@ -751,7 +751,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, break; default: status = -EINVAL; - goto out; + goto out_release_acls; } d_alias = nfs3_do_create(dir, dentry, data); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 543d916f79ab..ba78df4b13d9 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -45,6 +45,7 @@ enum nfs4_client_state { NFS4CLNT_RECALL_RUNNING, NFS4CLNT_RECALL_ANY_LAYOUT_READ, NFS4CLNT_RECALL_ANY_LAYOUT_RW, + NFS4CLNT_DELEGRETURN_DELAYED, }; #define NFS4_RENEW_TIMEOUT 0x01 @@ -322,7 +323,8 @@ extern int update_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode); - +extern int nfs4_proc_setlease(struct file *file, long arg, + struct file_lock **lease, void **priv); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); extern void nfs4_update_changeattr(struct inode *dir, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 42719384e25f..28431acd1230 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -197,8 +197,11 @@ void nfs40_shutdown_client(struct nfs_client *clp) struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) { - int err; + char buf[INET6_ADDRSTRLEN + 1]; + const char *ip_addr = cl_init->ip_addr; struct nfs_client *clp = nfs_alloc_client(cl_init); + int err; + if (IS_ERR(clp)) return clp; @@ -222,6 +225,44 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) init_waitqueue_head(&clp->cl_lock_waitq); #endif INIT_LIST_HEAD(&clp->pending_cb_stateids); + + if (cl_init->minorversion != 0) + __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); + __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); + __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); + + /* + * Set up the connection to the server before we add it
to the + * global list. + */ + err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); + if (err == -EINVAL) + err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); + if (err < 0) + goto error; + + /* If no clientaddr= option was specified, find a usable cb address */ + if (ip_addr == NULL) { + struct sockaddr_storage cb_addr; + struct sockaddr *sap = (struct sockaddr *)&cb_addr; + + err = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); + if (err < 0) + goto error; + err = rpc_ntop(sap, buf, sizeof(buf)); + if (err < 0) + goto error; + ip_addr = (const char *)buf; + } + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + + err = nfs_idmap_new(clp); + if (err < 0) { + dprintk("%s: failed to create idmapper. Error = %d\n", + __func__, err); + goto error; + } + __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); return clp; error: @@ -372,8 +413,6 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init) { - char buf[INET6_ADDRSTRLEN + 1]; - const char *ip_addr = cl_init->ip_addr; struct nfs_client *old; int error; @@ -381,43 +420,6 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, /* the client is initialised already */ return clp; - /* Check NFS protocol revision and initialize RPC op vector */ - clp->rpc_ops = &nfs_v4_clientops; - - if (clp->cl_minorversion != 0) - __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); - __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); - __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - - error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); - if (error == -EINVAL) - error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); - if (error < 0) - goto error; - - /* If no clientaddr= option was specified, find a usable cb address */ - if (ip_addr == NULL) { - struct sockaddr_storage cb_addr; - struct sockaddr *sap = (struct sockaddr *)&cb_addr; - - error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); - if (error < 0) - goto error; - error = rpc_ntop(sap, buf, sizeof(buf)); - if (error < 0) - goto error; - ip_addr = (const char *)buf; - } - strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); - - error = nfs_idmap_new(clp); - if (error < 0) { - dprintk("%s: failed to create idmapper. 
Error = %d\n", - __func__, error); - goto error; - } - __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); - error = nfs4_init_client_minor_version(clp); if (error < 0) goto error; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index a1e5c6b85ded..c820de58a661 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -435,6 +435,12 @@ void nfs42_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ +static int nfs4_setlease(struct file *file, long arg, struct file_lock **lease, + void **priv) +{ + return nfs4_proc_setlease(file, arg, lease, priv); +} + const struct file_operations nfs4_file_operations = { .read_iter = nfs_file_read, .write_iter = nfs_file_write, @@ -448,7 +454,7 @@ const struct file_operations nfs4_file_operations = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, - .setlease = simple_nosetlease, + .setlease = nfs4_setlease, #ifdef CONFIG_NFS_V4_2 .copy_file_range = nfs4_copy_file_range, .llseek = nfs4_file_llseek, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e653654c10bc..e1214bb6b7ee 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1155,7 +1155,11 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res) { - return nfs4_do_call_sync(clnt, server, msg, args, res, 0); + unsigned short task_flags = 0; + + if (server->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; + return nfs4_do_call_sync(clnt, server, msg, args, res, task_flags); } @@ -1205,12 +1209,12 @@ nfs4_update_changeattr_locked(struct inode *inode, u64 change_attr = inode_peek_iversion_raw(inode); cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + if (S_ISDIR(inode->i_mode)) + cache_validity |= NFS_INO_INVALID_DATA; switch (NFS_SERVER(inode)->change_attr_type) { case NFS4_CHANGE_TYPE_IS_UNDEFINED: - break; - case NFS4_CHANGE_TYPE_IS_TIME_METADATA: - if ((s64)(change_attr - cinfo->after) > 0) + if (cinfo->after == change_attr) goto out; break; default: @@ -1218,24 +1222,21 @@ nfs4_update_changeattr_locked(struct inode *inode, goto out; } - if (cinfo->atomic && cinfo->before == change_attr) { - nfsi->attrtimeo_timestamp = jiffies; - } else { - if (S_ISDIR(inode->i_mode)) { - cache_validity |= NFS_INO_INVALID_DATA; + inode_set_iversion_raw(inode, cinfo->after); + if (!cinfo->atomic || cinfo->before != change_attr) { + if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); - } else { - if (!NFS_PROTO(inode)->have_delegation(inode, - FMODE_READ)) - cache_validity |= NFS_INO_REVAL_PAGECACHE; - } - if (cinfo->before != change_attr) - cache_validity |= NFS_INO_INVALID_ACCESS | - NFS_INO_INVALID_ACL | - NFS_INO_INVALID_XATTR; + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + cache_validity |= + NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | + NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | + NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR | + NFS_INO_REVAL_PAGECACHE; + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); } - inode_set_iversion_raw(inode, cinfo->after); + nfsi->attrtimeo_timestamp = jiffies; nfsi->read_cache_jiffies = timestamp; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE; @@ -2569,6 +2570,9 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, }; int status; + if (server->nfs_client->cl_minorversion) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + kref_get(&data->kref); data->rpc_done = false; 
data->rpc_status = 0; @@ -3749,6 +3753,9 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) }; int status = -ENOMEM; + if (server->nfs_client->cl_minorversion) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg); @@ -4188,6 +4195,9 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, }; unsigned short task_flags = 0; + if (nfs4_has_session(server->nfs_client)) + task_flags = RPC_TASK_MOVEABLE; + /* Is this an attribute revalidation, subject to softreval? */ if (inode && (server->flags & NFS_MOUNT_SOFTREVAL)) task_flags |= RPC_TASK_TIMEOUT; @@ -4307,6 +4317,9 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, }; unsigned short task_flags = 0; + if (server->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; + /* Is this an attribute revalidation, subject to softreval? */ if (nfs_lookup_is_soft_revalidate(dentry)) task_flags |= RPC_TASK_TIMEOUT; @@ -6538,7 +6551,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_delegreturn_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE, }; int status = 0; @@ -6856,6 +6869,11 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; + struct nfs_client *client = + NFS_SERVER(lsp->ls_state->inode)->nfs_client; + + if (client->cl_minorversion) + task_setup_data.flags |= RPC_TASK_MOVEABLE; nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg); @@ -7130,6 +7148,10 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int ret; + struct nfs_client *client = NFS_SERVER(state->inode)->nfs_client; + + if (client->cl_minorversion) + task_setup_data.flags |= RPC_TASK_MOVEABLE; dprintk("%s: begin!\n", __func__); data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), @@ -7438,6 +7460,43 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) return nfs4_retry_setlk(state, cmd, request); } +static int nfs4_delete_lease(struct file *file, void **priv) +{ + return generic_setlease(file, F_UNLCK, NULL, priv); +} + +static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, + void **priv) +{ + struct inode *inode = file_inode(file); + fmode_t type = arg == F_RDLCK ?
FMODE_READ : FMODE_WRITE; + int ret; + + /* No delegation, no lease */ + if (!nfs4_have_delegation(inode, type)) + return -EAGAIN; + ret = generic_setlease(file, arg, lease, priv); + if (ret || nfs4_have_delegation(inode, type)) + return ret; + /* We raced with a delegation return */ + nfs4_delete_lease(file, priv); + return -EAGAIN; +} + +int nfs4_proc_setlease(struct file *file, long arg, struct file_lock **lease, + void **priv) +{ + switch (arg) { + case F_RDLCK: + case F_WRLCK: + return nfs4_add_lease(file, arg, lease, priv); + case F_UNLCK: + return nfs4_delete_lease(file, priv); + default: + return -EINVAL; + } +} + int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid) { struct nfs_server *server = NFS_SERVER(state->inode); @@ -9186,7 +9245,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs41_sequence_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE, }; struct rpc_task *ret; @@ -9385,7 +9444,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, { struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); - struct pnfs_layout_hdr *lo; + struct pnfs_layout_hdr *lo = lgp->lo; int nfs4err = task->tk_status; int err, status = 0; LIST_HEAD(head); @@ -9437,7 +9496,6 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, case -NFS4ERR_BAD_STATEID: exception->timeout = 0; spin_lock(&inode->i_lock); - lo = NFS_I(inode)->layout; /* If the open stateid was bad, then recover it. */ if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || !nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { @@ -9509,7 +9567,8 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) .rpc_message = &msg, .callback_ops = &nfs4_layoutget_call_ops, .callback_data = lgp, - .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | + RPC_TASK_MOVEABLE, }; struct pnfs_layout_segment *lseg = NULL; struct nfs4_exception exception = { @@ -9520,9 +9579,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) dprintk("--> %s\n", __func__); - /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ - pnfs_get_layout_hdr(NFS_I(inode)->layout); - nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0); task = rpc_run_task(&task_setup_data); @@ -9650,6 +9706,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) .rpc_message = &msg, .callback_ops = &nfs4_layoutreturn_call_ops, .callback_data = lrp, + .flags = RPC_TASK_MOVEABLE, }; int status = 0; @@ -9804,6 +9861,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) .rpc_message = &msg, .callback_ops = &nfs4_layoutcommit_ops, .callback_data = data, + .flags = RPC_TASK_MOVEABLE, }; struct rpc_task *task; int status = 0; @@ -10131,7 +10189,7 @@ static int nfs41_free_stateid(struct nfs_server *server, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs41_free_stateid_ops, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE, }; struct nfs_free_stateid_data *data; struct rpc_task *task; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index cf9cc62ec48e..cc232d1f16f2 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -954,6 +954,7 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) { struct nfs_pgio_header *hdr; int ret; + unsigned short 
task_flags = 0; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { @@ -962,14 +963,17 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) } nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); ret = nfs_generic_pgio(desc, hdr); - if (ret == 0) + if (ret == 0) { + if (NFS_SERVER(hdr->inode)->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), hdr, hdr->cred, NFS_PROTO(hdr->inode), desc->pg_rpc_callops, desc->pg_ioflags, - RPC_TASK_CRED_NOREF); + RPC_TASK_CRED_NOREF | task_flags); + } return ret; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2c01ee805306..ef14ea0b6ab8 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -966,10 +966,8 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, const struct cred *cred, bool update_barrier) { - u32 oldseq, newseq, new_barrier = 0; - - oldseq = be32_to_cpu(lo->plh_stateid.seqid); - newseq = be32_to_cpu(new->seqid); + u32 oldseq = be32_to_cpu(lo->plh_stateid.seqid); + u32 newseq = be32_to_cpu(new->seqid); if (!pnfs_layout_is_valid(lo)) { pnfs_set_layout_cred(lo, cred); @@ -979,19 +977,21 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); return; } - if (pnfs_seqid_is_newer(newseq, oldseq)) { + + if (pnfs_seqid_is_newer(newseq, oldseq)) nfs4_stateid_copy(&lo->plh_stateid, new); - /* - * Because of wraparound, we want to keep the barrier - * "close" to the current seqids. - */ - new_barrier = newseq - atomic_read(&lo->plh_outstanding); - } - if (update_barrier) - new_barrier = be32_to_cpu(new->seqid); - else if (new_barrier == 0) + + if (update_barrier) { + pnfs_barrier_update(lo, newseq); return; - pnfs_barrier_update(lo, new_barrier); + } + /* + * Because of wraparound, we want to keep the barrier + * "close" to the current seqids. We really only want to + * get here from a layoutget call. + */ + if (atomic_read(&lo->plh_outstanding) == 1) + pnfs_barrier_update(lo, be32_to_cpu(lo->plh_stateid.seqid)); } static bool @@ -1128,8 +1128,7 @@ void pnfs_layoutget_free(struct nfs4_layoutget *lgp) size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE; nfs4_free_pages(lgp->args.layout.pages, max_pages); - if (lgp->args.inode) - pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout); + pnfs_put_layout_hdr(lgp->lo); put_nfs_open_context(lgp->args.ctx); kfree(lgp); } @@ -2014,7 +2013,7 @@ lookup_again: * If the layout segment list is empty, but there are outstanding * layoutget calls, then they might be subject to a layoutrecall. 
*/ - if (list_empty(&lo->plh_segs) && + if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) && atomic_read(&lo->plh_outstanding) != 0) { spin_unlock(&ino->i_lock); lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding, @@ -2124,6 +2123,9 @@ lookup_again: goto out_put_layout_hdr; } + lgp->lo = lo; + pnfs_get_layout_hdr(lo); + lseg = nfs4_proc_layoutget(lgp, &timeout); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); @@ -2255,6 +2257,7 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data, pnfs_put_layout_hdr(lo); return; } + lgp->lo = lo; data->lgp = lgp; data->o_arg.lg_args = &lgp->args; data->o_res.lg_res = &lgp->res; @@ -2263,6 +2266,7 @@ static void _lgopen_prepare_floating(struct nfs4_opendata *data, struct nfs_open_context *ctx) { + struct inode *ino = data->dentry->d_inode; struct pnfs_layout_range rng = { .iomode = (data->o_arg.fmode & FMODE_WRITE) ? IOMODE_RW: IOMODE_READ, @@ -2271,7 +2275,7 @@ static void _lgopen_prepare_floating(struct nfs4_opendata *data, }; struct nfs4_layoutget *lgp; - lgp = pnfs_alloc_init_layoutget_args(NULL, ctx, &current_stateid, + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng, GFP_KERNEL); if (!lgp) return; @@ -2291,6 +2295,8 @@ void pnfs_lgopen_prepare(struct nfs4_opendata *data, /* Could check on max_ops, but currently hardcoded high enough */ if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN)) return; + if (data->lgp) + return; if (data->state) _lgopen_prepare_attached(data, ctx); else @@ -2330,13 +2336,13 @@ void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp, } return; } - if (!lgp->args.inode) { + if (!lgp->lo) { lo = _pnfs_grab_empty_layout(ino, ctx); if (!lo) return; - lgp->args.inode = ino; + lgp->lo = lo; } else - lo = NFS_I(lgp->args.inode)->layout; + lo = lgp->lo; lseg = pnfs_layout_process(lgp); if (!IS_ERR(lseg)) { @@ -2349,11 +2355,9 @@ void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp, void nfs4_lgopen_release(struct nfs4_layoutget *lgp) { if (lgp != NULL) { - struct inode *inode = lgp->args.inode; - if (inode) { - struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; - pnfs_clear_first_layoutget(lo); - nfs_layoutget_end(lo); + if (lgp->lo) { + pnfs_clear_first_layoutget(lgp->lo); + nfs_layoutget_end(lgp->lo); } pnfs_layoutget_free(lgp); } @@ -2362,7 +2366,7 @@ void nfs4_lgopen_release(struct nfs4_layoutget *lgp) struct pnfs_layout_segment * pnfs_layout_process(struct nfs4_layoutget *lgp) { - struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct pnfs_layout_hdr *lo = lgp->lo; struct nfs4_layoutget_res *res = &lgp->res; struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; @@ -2390,11 +2394,13 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } + if (!pnfs_layout_is_valid(lo) && !pnfs_is_first_layoutget(lo)) + goto out_forget; + if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches.
*/ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { - if (!pnfs_layout_is_valid(lo) && - pnfs_is_first_layoutget(lo)) + if (!pnfs_layout_is_valid(lo)) lo->plh_barrier = 0; dprintk("%s forget reply due to sequence\n", __func__); goto out_forget; @@ -2413,8 +2419,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } else { /* We have a completely new layout */ - if (!pnfs_is_first_layoutget(lo)) - goto out_forget; pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true); } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 49d3389bd813..cf19914fec81 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -805,19 +805,16 @@ out: } EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add); -static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) +static int nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) { might_sleep(); - wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, - TASK_KILLABLE); + return wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, TASK_KILLABLE); } static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) { smp_mb__before_atomic(); - clear_bit(NFS4DS_CONNECTING, &ds->ds_state); - smp_mb__after_atomic(); - wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); + clear_and_wake_up_bit(NFS4DS_CONNECTING, &ds->ds_state); } static struct nfs_client *(*get_v3_ds_connect)( @@ -858,7 +855,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); if (!load_v3_ds_connect()) - goto out; + return -EPROTONOSUPPORT; list_for_each_entry(da, &ds->ds_addrs, da_node) { dprintk("%s: DS %s: trying address %s\n", @@ -993,30 +990,33 @@ int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, { int err; -again: - err = 0; - if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { - if (version == 3) { - err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, - retrans); - } else if (version == 4) { - err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, - retrans, minor_version); - } else { - dprintk("%s: unsupported DS version %d\n", __func__, - version); - err = -EPROTONOSUPPORT; - } + do { + err = nfs4_wait_ds_connect(ds); + if (err || ds->ds_clp) + goto out; + if (nfs4_test_deviceid_unavailable(devid)) + return -ENODEV; + } while (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) != 0); - nfs4_clear_ds_conn_bit(ds); - } else { - nfs4_wait_ds_connect(ds); + if (ds->ds_clp) + goto connect_done; - /* what was waited on didn't connect AND didn't mark unavail */ - if (!ds->ds_clp && !nfs4_test_deviceid_unavailable(devid)) - goto again; + switch (version) { + case 3: + err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, retrans); + break; + case 4: + err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, retrans, + minor_version); + break; + default: + dprintk("%s: unsupported DS version %d\n", __func__, version); + err = -EPROTONOSUPPORT; } +connect_done: + nfs4_clear_ds_conn_bit(ds); +out: /* * At this point the ds->ds_clp should be ready, but it might have * hit an error. 
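
The nfs4_pnfs_ds_connect() rework above replaces the old retry loop with a wait-then-claim scheme: a caller first waits for the NFS4DS_CONNECTING bit to clear, returns early if another caller already connected (ds->ds_clp) or marked the device unavailable, and otherwise claims the bit with test_and_set_bit() and performs the connect itself; clear_and_wake_up_bit() then releases everyone else. A rough pthread-based sketch of the same claim/publish pattern (struct ds and do_connect() are illustrative stand-ins, not kernel code):

#include <pthread.h>
#include <stdbool.h>

struct ds {
	pthread_mutex_t lock;
	pthread_cond_t wq;
	bool connecting;	/* analogue of the NFS4DS_CONNECTING bit */
	bool connected;		/* analogue of ds->ds_clp != NULL */
};

static int do_connect(struct ds *ds) { (void)ds; return 0; /* stub */ }

int ds_connect_once(struct ds *ds)
{
	int err = 0;

	pthread_mutex_lock(&ds->lock);
	/* Wait until nobody else holds the CONNECTING "bit"... */
	while (ds->connecting)
		pthread_cond_wait(&ds->wq, &ds->lock);
	/* ...and bail out if a previous winner already connected. */
	if (ds->connected)
		goto out;
	ds->connecting = true;		/* test_and_set_bit() analogue */
	pthread_mutex_unlock(&ds->lock);

	err = do_connect(ds);		/* runs unlocked, like the RPC setup */

	pthread_mutex_lock(&ds->lock);
	ds->connected = (err == 0);
	ds->connecting = false;		/* clear_and_wake_up_bit() analogue */
	pthread_cond_broadcast(&ds->wq);
out:
	pthread_mutex_unlock(&ds->lock);
	return err;
}
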
diff --git a/fs/nfs/read.c b/fs/nfs/read.c index d2b6dce1f99f..9f39e0a1a38b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -74,8 +74,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); -static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode) +static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *pgm; unsigned long npages; @@ -86,9 +85,9 @@ static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio, WARN_ON_ONCE(pgio->pg_mirror_count != 1); pgm = &pgio->pg_mirrors[0]; - NFS_I(inode)->read_io += pgm->pg_bytes_written; + NFS_I(pgio->pg_inode)->read_io += pgm->pg_bytes_written; npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT; - nfs_add_stats(inode, NFSIOS_READPAGES, npages); + nfs_add_stats(pgio->pg_inode, NFSIOS_READPAGES, npages); } @@ -363,22 +362,23 @@ int nfs_readpage(struct file *file, struct page *page) } else desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); + xchg(&desc.ctx->error, 0); if (!IS_SYNC(inode)) { ret = nfs_readpage_from_fscache(desc.ctx, inode, page); if (ret == 0) - goto out; + goto out_wait; } - xchg(&desc.ctx->error, 0); nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); ret = readpage_async_filler(&desc, page); + if (ret) + goto out; - if (!ret) - nfs_pageio_complete_read(&desc.pgio, inode); - + nfs_pageio_complete_read(&desc.pgio); ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0; +out_wait: if (!ret) { ret = wait_on_page_locked_killable(page); if (!PageUptodate(page) && !ret) @@ -430,7 +430,7 @@ int nfs_readpages(struct file *file, struct address_space *mapping, ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); - nfs_pageio_complete_read(&desc.pgio, inode); + nfs_pageio_complete_read(&desc.pgio); read_complete: put_nfs_open_context(desc.ctx); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3bf82178166a..eae9bf114041 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1810,6 +1810,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo) { struct nfs_commit_data *data; + unsigned short task_flags = 0; /* another commit raced with us */ if (list_empty(head)) @@ -1820,8 +1821,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); atomic_inc(&cinfo->mds->rpcs_out); + if (NFS_SERVER(inode)->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), - data->mds_ops, how, RPC_TASK_CRED_NOREF); + data->mds_ops, how, + RPC_TASK_CRED_NOREF | task_flags); } /* diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 26f2a50eceac..edec45831585 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -82,6 +82,7 @@ __state_in_grace(struct net *net, bool open) /** * locks_in_grace + * @net: network namespace * * Lock managers call this function to determine when it is OK for them * to answer ordinary lock requests, and when they should accept only diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 1058659a8d31..c99dee99a3c1 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -236,7 +236,7 @@ again: if (!buf) return -ENOMEM; - rq = blk_get_request(q, REQ_OP_SCSI_IN, 0); + rq = blk_get_request(q, REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) { error = -ENOMEM; goto out_free_buf; diff --git 
a/fs/nfsd/netns.h b/fs/nfsd/netns.h index a75abeb1e698..935c1028c217 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -176,6 +176,12 @@ struct nfsd_net { unsigned int longest_chain_cachesize; struct shrinker nfsd_reply_cache_shrinker; + + /* tracking server-to-server copy mounts */ + spinlock_t nfsd_ssc_lock; + struct list_head nfsd_ssc_mount_list; + wait_queue_head_t nfsd_ssc_waitq; + /* utsname taken from the process that starts the server */ char nfsd_name[UNX_MAXNODENAME+1]; }; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index a1591feeea22..5dfe7644a517 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -172,7 +172,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; struct kvec *head = rqstp->rq_res.head; - struct inode *inode = d_inode(dentry); + struct inode *inode; unsigned int base; int n; int w; @@ -181,6 +181,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) return 0; switch (resp->status) { case nfs_ok: + inode = d_inode(dentry); if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) return 0; if (xdr_stream_encode_u32(xdr, resp->mask) < 0) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7325592b456e..0f8b10f363e7 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -915,10 +915,8 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = clp->cl_cred.cr_flavor; clp->cl_cb_ident = conn->cb_ident; } else { - if (!conn->cb_xprt) { - trace_nfsd_cb_setup_err(clp, -EINVAL); + if (!conn->cb_xprt) return -EINVAL; - } clp->cl_cb_conn.cb_xprt = conn->cb_xprt; clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; @@ -941,37 +939,43 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c } clp->cl_cb_client = client; clp->cl_cb_cred = cred; - trace_nfsd_cb_setup(clp); + rcu_read_lock(); + trace_nfsd_cb_setup(clp, rpc_peeraddr2str(client, RPC_DISPLAY_NETID), + args.authflavor); + rcu_read_unlock(); return 0; } +static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate) +{ + if (clp->cl_cb_state != newstate) { + clp->cl_cb_state = newstate; + trace_nfsd_cb_state(clp); + } +} + static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; - clp->cl_cb_state = NFSD4_CB_DOWN; - trace_nfsd_cb_state(clp); + nfsd4_mark_cb_state(clp, NFSD4_CB_DOWN); } static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; - clp->cl_cb_state = NFSD4_CB_FAULT; - trace_nfsd_cb_state(clp); + nfsd4_mark_cb_state(clp, NFSD4_CB_FAULT); } static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) { struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); - trace_nfsd_cb_done(clp, task->tk_status); if (task->tk_status) nfsd4_mark_cb_down(clp, task->tk_status); - else { - clp->cl_cb_state = NFSD4_CB_UP; - trace_nfsd_cb_state(clp); - } + else + nfsd4_mark_cb_state(clp, NFSD4_CB_UP); } static void nfsd4_cb_probe_release(void *calldata) @@ -995,8 +999,8 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = { */ void nfsd4_probe_callback(struct nfs4_client *clp) { - clp->cl_cb_state = NFSD4_CB_UNKNOWN; - trace_nfsd_cb_state(clp); + trace_nfsd_cb_probe(clp); + nfsd4_mark_cb_state(clp, NFSD4_CB_UNKNOWN); set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); 
nfsd4_run_cb(&clp->cl_cb_null); } @@ -1009,11 +1013,10 @@ void nfsd4_probe_callback_sync(struct nfs4_client *clp) void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) { - clp->cl_cb_state = NFSD4_CB_UNKNOWN; + nfsd4_mark_cb_state(clp, NFSD4_CB_UNKNOWN); spin_lock(&clp->cl_lock); memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); spin_unlock(&clp->cl_lock); - trace_nfsd_cb_state(clp); } /* @@ -1170,8 +1173,6 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) struct nfsd4_callback *cb = calldata; struct nfs4_client *clp = cb->cb_clp; - trace_nfsd_cb_done(clp, task->tk_status); - if (!nfsd4_cb_sequence_done(task, cb)) return; @@ -1231,6 +1232,9 @@ void nfsd4_destroy_callback_queue(void) /* must be called under the state lock */ void nfsd4_shutdown_callback(struct nfs4_client *clp) { + if (clp->cl_cb_state != NFSD4_CB_UNKNOWN) + trace_nfsd_cb_shutdown(clp); + set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags); /* * Note this won't actually result in a null callback; @@ -1276,7 +1280,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) * kill the old client: */ if (clp->cl_cb_client) { - trace_nfsd_cb_shutdown(clp); rpc_shutdown_client(clp->cl_cb_client); clp->cl_cb_client = NULL; put_cred(clp->cl_cb_cred); @@ -1322,8 +1325,6 @@ nfsd4_run_cb_work(struct work_struct *work) struct rpc_clnt *clnt; int flags; - trace_nfsd_cb_work(clp, cb->cb_msg.rpc_proc->p_name); - if (cb->cb_need_restart) { cb->cb_need_restart = false; } else { @@ -1345,7 +1346,7 @@ nfsd4_run_cb_work(struct work_struct *work) * Don't send probe messages for 4.1 or later. */ if (!cb->cb_ops && clp->cl_minorversion) { - clp->cl_cb_state = NFSD4_CB_UP; + nfsd4_mark_cb_state(clp, NFSD4_CB_UP); nfsd41_destroy_cb(cb); return; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f4ce93d7f26e..486c5dba4b65 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -55,6 +55,13 @@ module_param(inter_copy_offload_enable, bool, 0644); MODULE_PARM_DESC(inter_copy_offload_enable, "Enable inter server to server copy offload. Default: false"); +#ifdef CONFIG_NFSD_V4_2_INTER_SSC +static int nfsd4_ssc_umount_timeout = 900000; /* default to 15 mins */ +module_param(nfsd4_ssc_umount_timeout, int, 0644); +MODULE_PARM_DESC(nfsd4_ssc_umount_timeout, + "idle msecs before unmount export from source server"); +#endif + #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include <linux/security.h> @@ -1166,6 +1173,81 @@ extern void nfs_sb_deactive(struct super_block *sb); #define NFSD42_INTERSSC_MOUNTOPS "vers=4.2,addr=%s,sec=sys" /* + * setup a work entry in the ssc delayed unmount list. 
+ */ +static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr, + struct nfsd4_ssc_umount_item **retwork, struct vfsmount **ss_mnt) +{ + struct nfsd4_ssc_umount_item *ni = 0; + struct nfsd4_ssc_umount_item *work = NULL; + struct nfsd4_ssc_umount_item *tmp; + DEFINE_WAIT(wait); + + *ss_mnt = NULL; + *retwork = NULL; + work = kzalloc(sizeof(*work), GFP_KERNEL); +try_again: + spin_lock(&nn->nfsd_ssc_lock); + list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { + if (strncmp(ni->nsui_ipaddr, ipaddr, sizeof(ni->nsui_ipaddr))) + continue; + /* found a match */ + if (ni->nsui_busy) { + /* wait - and try again */ + prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, + TASK_INTERRUPTIBLE); + spin_unlock(&nn->nfsd_ssc_lock); + + /* allow 20secs for mount/unmount for now - revisit */ + if (signal_pending(current) || + (schedule_timeout(20*HZ) == 0)) { + kfree(work); + return nfserr_eagain; + } + finish_wait(&nn->nfsd_ssc_waitq, &wait); + goto try_again; + } + *ss_mnt = ni->nsui_vfsmount; + refcount_inc(&ni->nsui_refcnt); + spin_unlock(&nn->nfsd_ssc_lock); + kfree(work); + + /* return vfsmount in ss_mnt */ + return 0; + } + if (work) { + strncpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr)); + refcount_set(&work->nsui_refcnt, 2); + work->nsui_busy = true; + list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list); + *retwork = work; + } + spin_unlock(&nn->nfsd_ssc_lock); + return 0; +} + +static void nfsd4_ssc_update_dul_work(struct nfsd_net *nn, + struct nfsd4_ssc_umount_item *work, struct vfsmount *ss_mnt) +{ + /* set nsui_vfsmount, clear busy flag and wakeup waiters */ + spin_lock(&nn->nfsd_ssc_lock); + work->nsui_vfsmount = ss_mnt; + work->nsui_busy = false; + wake_up_all(&nn->nfsd_ssc_waitq); + spin_unlock(&nn->nfsd_ssc_lock); +} + +static void nfsd4_ssc_cancel_dul_work(struct nfsd_net *nn, + struct nfsd4_ssc_umount_item *work) +{ + spin_lock(&nn->nfsd_ssc_lock); + list_del(&work->nsui_list); + wake_up_all(&nn->nfsd_ssc_waitq); + spin_unlock(&nn->nfsd_ssc_lock); + kfree(work); +} + +/* * Support one copy source server for now. 
*/ static __be32 @@ -1181,6 +1263,8 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, char *ipaddr, *dev_name, *raw_data; int len, raw_len; __be32 status = nfserr_inval; + struct nfsd4_ssc_umount_item *work = NULL; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); naddr = &nss->u.nl4_addr; tmp_addrlen = rpc_uaddr2sockaddr(SVC_NET(rqstp), naddr->addr, @@ -1229,12 +1313,24 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, goto out_free_rawdata; snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep); + status = nfsd4_ssc_setup_dul(nn, ipaddr, &work, &ss_mnt); + if (status) + goto out_free_devname; + if (ss_mnt) + goto out_done; + /* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */ ss_mnt = vfs_kern_mount(type, SB_KERNMOUNT, dev_name, raw_data); module_put(type->owner); - if (IS_ERR(ss_mnt)) + if (IS_ERR(ss_mnt)) { + status = nfserr_nodev; + if (work) + nfsd4_ssc_cancel_dul_work(nn, work); goto out_free_devname; - + } + if (work) + nfsd4_ssc_update_dul_work(nn, work, ss_mnt); +out_done: status = 0; *mount = ss_mnt; @@ -1301,10 +1397,42 @@ static void nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src, struct nfsd_file *dst) { + bool found = false; + long timeout; + struct nfsd4_ssc_umount_item *tmp; + struct nfsd4_ssc_umount_item *ni = NULL; + struct nfsd_net *nn = net_generic(dst->nf_net, nfsd_net_id); + nfs42_ssc_close(src->nf_file); - fput(src->nf_file); nfsd_file_put(dst); - mntput(ss_mnt); + fput(src->nf_file); + + if (!nn) { + mntput(ss_mnt); + return; + } + spin_lock(&nn->nfsd_ssc_lock); + timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout); + list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { + if (ni->nsui_vfsmount->mnt_sb == ss_mnt->mnt_sb) { + list_del(&ni->nsui_list); + /* + * vfsmount can be shared by multiple exports, + * decrement refcnt. If the count drops to 1 it + * will be unmounted when nsui_expire expires. + */ + refcount_dec(&ni->nsui_refcnt); + ni->nsui_expire = jiffies + timeout; + list_add_tail(&ni->nsui_list, &nn->nfsd_ssc_mount_list); + found = true; + break; + } + } + spin_unlock(&nn->nfsd_ssc_lock); + if (!found) { + mntput(ss_mnt); + return; + } } #else /* CONFIG_NFSD_V4_2_INTER_SSC */ @@ -1375,7 +1503,8 @@ static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = { static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync) { - copy->cp_res.wr_stable_how = NFS_UNSTABLE; + copy->cp_res.wr_stable_how = + copy->committed ? 
NFS_FILE_SYNC : NFS_UNSTABLE; copy->cp_synchronous = sync; gen_boot_verifier(&copy->cp_res.wr_verifier, copy->cp_clp->net); } @@ -1386,6 +1515,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) u64 bytes_total = copy->cp_count; u64 src_pos = copy->cp_src_pos; u64 dst_pos = copy->cp_dst_pos; + __be32 status; /* See RFC 7862 p.67: */ if (bytes_total == 0) @@ -1403,6 +1533,16 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) src_pos += bytes_copied; dst_pos += bytes_copied; } while (bytes_total > 0 && !copy->cp_synchronous); + /* for a non-zero asynchronous copy do a commit of data */ + if (!copy->cp_synchronous && copy->cp_res.wr_bytes_written > 0) { + down_write(&copy->nf_dst->nf_rwsem); + status = vfs_fsync_range(copy->nf_dst->nf_file, + copy->cp_dst_pos, + copy->cp_res.wr_bytes_written, 0); + up_write(&copy->nf_dst->nf_rwsem); + if (!status) + copy->committed = true; + } return bytes_copied; } @@ -1497,6 +1637,8 @@ do_callback: memcpy(&cb_copy->fh, &copy->fh, sizeof(copy->fh)); nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp, &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD); + trace_nfsd_cb_offload(copy->cp_clp, &copy->cp_res.cb_stateid, + &copy->fh, copy->cp_count, copy->nfserr); nfsd4_run_cb(&cb_copy->cp_cb); out: if (!copy->cp_intra) @@ -3232,7 +3374,7 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) { struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfsd4_compoundargs *argp = rqstp->rq_argp; - struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; + struct nfsd4_op *this; struct nfsd4_compound_state *cstate = &resp->cstate; struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow; u32 opiter; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index cd5eac2ba054..fa67ecd5fe63 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -44,6 +44,7 @@ #include <linux/jhash.h> #include <linux/string_helpers.h> #include <linux/fsnotify.h> +#include <linux/nfs_ssc.h> #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" @@ -1745,6 +1746,8 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u) struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user); struct nfs4_client *clp = c->cn_session->se_client; + trace_nfsd_cb_lost(clp); + spin_lock(&clp->cl_lock); if (!list_empty(&c->cn_persession)) { list_del(&c->cn_persession); @@ -2355,6 +2358,21 @@ static void seq_quote_mem(struct seq_file *m, char *data, int len) seq_printf(m, "\""); } +static const char *cb_state2str(int state) +{ + switch (state) { + case NFSD4_CB_UP: + return "UP"; + case NFSD4_CB_UNKNOWN: + return "UNKNOWN"; + case NFSD4_CB_DOWN: + return "DOWN"; + case NFSD4_CB_FAULT: + return "FAULT"; + } + return "UNDEFINED"; +} + static int client_info_show(struct seq_file *m, void *v) { struct inode *inode = m->private; @@ -2383,6 +2401,8 @@ static int client_info_show(struct seq_file *m, void *v) seq_printf(m, "\nImplementation time: [%lld, %ld]\n", clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec); } + seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state)); + seq_printf(m, "callback address: %pISpc\n", &clp->cl_cb_conn.cb_addr); drop_client(clp); return 0; @@ -2665,6 +2685,8 @@ static void force_expire_client(struct nfs4_client *clp) struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); bool already_expired; + trace_nfsd_clid_admin_expired(&clp->cl_clientid); + spin_lock(&clp->cl_lock); clp->cl_time = 0; spin_unlock(&clp->cl_lock); @@ -2816,14 +2838,11 @@ move_to_confirmed(struct nfs4_client *clp) lockdep_assert_held(&nn->client_lock); - dprintk("NFSD: move_to_confirm
nfs4_client %p\n", clp); list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); add_clp_to_name_tree(clp, &nn->conf_name_tree); - if (!test_and_set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags) && - clp->cl_nfsd_dentry && - clp->cl_nfsd_info_dentry) - fsnotify_dentry(clp->cl_nfsd_info_dentry, FS_MODIFY); + set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); + trace_nfsd_clid_confirmed(&clp->cl_clientid); renew_client_locked(clp); } @@ -3176,20 +3195,24 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } /* case 6 */ exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; + trace_nfsd_clid_confirmed_r(conf); goto out_copy; } if (!creds_match) { /* case 3 */ if (client_has_state(conf)) { status = nfserr_clid_inuse; + trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; } goto out_new; } if (verfs_match) { /* case 2 */ conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; + trace_nfsd_clid_confirmed_r(conf); goto out_copy; } /* case 5, client reboot */ + trace_nfsd_clid_verf_mismatch(conf, rqstp, &verf); conf = NULL; goto out_new; } @@ -3199,16 +3222,19 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - unconf = find_unconfirmed_client_by_name(&exid->clname, nn); + unconf = find_unconfirmed_client_by_name(&exid->clname, nn); if (unconf) /* case 4, possible retry or client restart */ unhash_client_locked(unconf); - /* case 1 (normal case) */ + /* case 1, new owner ID */ + trace_nfsd_clid_fresh(new); + out_new: if (conf) { status = mark_client_expired_locked(conf); if (status) goto out; + trace_nfsd_clid_replaced(&conf->cl_clientid); } new->cl_minorversion = cstate->minorversion; new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0]; @@ -3232,8 +3258,10 @@ out: out_nolock: if (new) expire_client(new); - if (unconf) + if (unconf) { + trace_nfsd_clid_expire_unconf(&unconf->cl_clientid); expire_client(unconf); + } return status; } @@ -3425,9 +3453,10 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_conn; } } else if (unconf) { + status = nfserr_clid_inuse; if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { - status = nfserr_clid_inuse; + trace_nfsd_clid_cred_mismatch(unconf, rqstp); goto out_free_conn; } status = nfserr_wrong_cred; @@ -3447,6 +3476,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, old = NULL; goto out_free_conn; } + trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); conf = unconf; @@ -3471,6 +3501,8 @@ nfsd4_create_session(struct svc_rqst *rqstp, /* cache solo and embedded create sessions under the client_lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); spin_unlock(&nn->client_lock); + if (conf == unconf) + fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY); /* init connection and backchannel */ nfsd4_init_conn(rqstp, conn, new); nfsd4_put_session(new); @@ -3904,6 +3936,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, status = nfserr_wrong_cred; goto out; } + trace_nfsd_clid_destroyed(&clp->cl_clientid); unhash_client_locked(clp); out: spin_unlock(&nn->client_lock); @@ -3946,6 +3979,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, goto out; status = nfs_ok; + trace_nfsd_clid_reclaim_complete(&clp->cl_clientid); nfsd4_client_record_create(clp); inc_reclaim_complete(clp); out: @@ -3967,27 +4001,29 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, new = create_client(clname, rqstp, &clverifier); if (new == NULL) 
return nfserr_jukebox; - /* Cases below refer to rfc 3530 section 14.2.33: */ spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&clname, nn); if (conf && client_has_state(conf)) { - /* case 0: */ status = nfserr_clid_inuse; if (clp_used_exchangeid(conf)) goto out; if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { - trace_nfsd_clid_inuse_err(conf); + trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; } } unconf = find_unconfirmed_client_by_name(&clname, nn); if (unconf) unhash_client_locked(unconf); - /* We need to handle only case 1: probable callback update */ - if (conf && same_verf(&conf->cl_verifier, &clverifier)) { - copy_clid(new, conf); - gen_confirm(new, nn); - } + if (conf) { + if (same_verf(&conf->cl_verifier, &clverifier)) { + copy_clid(new, conf); + gen_confirm(new, nn); + } else + trace_nfsd_clid_verf_mismatch(conf, rqstp, + &clverifier); + } else + trace_nfsd_clid_fresh(new); new->cl_minorversion = 0; gen_callback(new, setclid, rqstp); add_to_unconfirmed(new); @@ -4000,12 +4036,13 @@ out: spin_unlock(&nn->client_lock); if (new) free_client(new); - if (unconf) + if (unconf) { + trace_nfsd_clid_expire_unconf(&unconf->cl_clientid); expire_client(unconf); + } return status; } - __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -4034,25 +4071,27 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, * Nevertheless, RFC 7530 recommends INUSE for this case: */ status = nfserr_clid_inuse; - if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) + if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { + trace_nfsd_clid_cred_mismatch(unconf, rqstp); goto out; - if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) + } + if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; - /* cases below refer to rfc 3530 section 14.2.34: */ + } if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) { if (conf && same_verf(&confirm, &conf->cl_confirm)) { - /* case 2: probable retransmit */ status = nfs_ok; - } else /* case 4: client hasn't noticed we rebooted yet? 
*/ + } else status = nfserr_stale_clientid; goto out; } status = nfs_ok; - if (conf) { /* case 1: callback update */ + if (conf) { old = unconf; unhash_client_locked(old); nfsd4_change_callback(conf, &unconf->cl_cb_conn); - } else { /* case 3: normal case; new or rebooted client */ + } else { old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = nfserr_clid_inuse; @@ -4065,12 +4104,15 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, old = NULL; goto out; } + trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); conf = unconf; } get_client_locked(conf); spin_unlock(&nn->client_lock); + if (conf == unconf) + fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY); nfsd4_probe_callback(conf); spin_lock(&nn->client_lock); put_client_renew_locked(conf); @@ -4618,7 +4660,7 @@ nfsd_break_deleg_cb(struct file_lock *fl) struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; struct nfs4_file *fp = dp->dl_stid.sc_file; - trace_nfsd_deleg_break(&dp->dl_stid.sc_stateid); + trace_nfsd_cb_recall(&dp->dl_stid); /* * We don't want the locks code to timeout the lease for us; @@ -5457,6 +5499,69 @@ static bool state_expired(struct laundry_time *lt, time64_t last_refresh) return false; } +#ifdef CONFIG_NFSD_V4_2_INTER_SSC +void nfsd4_ssc_init_umount_work(struct nfsd_net *nn) +{ + spin_lock_init(&nn->nfsd_ssc_lock); + INIT_LIST_HEAD(&nn->nfsd_ssc_mount_list); + init_waitqueue_head(&nn->nfsd_ssc_waitq); +} +EXPORT_SYMBOL_GPL(nfsd4_ssc_init_umount_work); + +/* + * This is called when nfsd is being shutdown, after all inter_ssc + * cleanup were done, to destroy the ssc delayed unmount list. + */ +static void nfsd4_ssc_shutdown_umount(struct nfsd_net *nn) +{ + struct nfsd4_ssc_umount_item *ni = NULL; + struct nfsd4_ssc_umount_item *tmp; + + spin_lock(&nn->nfsd_ssc_lock); + list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { + list_del(&ni->nsui_list); + spin_unlock(&nn->nfsd_ssc_lock); + mntput(ni->nsui_vfsmount); + kfree(ni); + spin_lock(&nn->nfsd_ssc_lock); + } + spin_unlock(&nn->nfsd_ssc_lock); +} + +static void nfsd4_ssc_expire_umount(struct nfsd_net *nn) +{ + bool do_wakeup = false; + struct nfsd4_ssc_umount_item *ni = 0; + struct nfsd4_ssc_umount_item *tmp; + + spin_lock(&nn->nfsd_ssc_lock); + list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { + if (time_after(jiffies, ni->nsui_expire)) { + if (refcount_read(&ni->nsui_refcnt) > 1) + continue; + + /* mark being unmount */ + ni->nsui_busy = true; + spin_unlock(&nn->nfsd_ssc_lock); + mntput(ni->nsui_vfsmount); + spin_lock(&nn->nfsd_ssc_lock); + + /* waiters need to start from begin of list */ + list_del(&ni->nsui_list); + kfree(ni); + + /* wakeup ssc_connect waiters */ + do_wakeup = true; + continue; + } + break; + } + if (do_wakeup) + wake_up_all(&nn->nfsd_ssc_waitq); + spin_unlock(&nn->nfsd_ssc_lock); +} +#endif + static time64_t nfs4_laundromat(struct nfsd_net *nn) { @@ -5495,10 +5600,8 @@ nfs4_laundromat(struct nfsd_net *nn) clp = list_entry(pos, struct nfs4_client, cl_lru); if (!state_expired(<, clp->cl_time)) break; - if (mark_client_expired_locked(clp)) { - trace_nfsd_clid_expired(&clp->cl_clientid); + if (mark_client_expired_locked(clp)) continue; - } list_add(&clp->cl_lru, &reaplist); } spin_unlock(&nn->client_lock); @@ -5568,6 +5671,10 @@ nfs4_laundromat(struct nfsd_net *nn) list_del_init(&nbl->nbl_lru); free_blocked_lock(nbl); } +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + /* service the server-to-server copy delayed unmount list */ + 
nfsd4_ssc_expire_umount(nn); +#endif out: return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); } @@ -6430,8 +6537,10 @@ nfsd4_lm_notify(struct file_lock *fl) } spin_unlock(&nn->blocked_locks_lock); - if (queue) + if (queue) { + trace_nfsd_cb_notify_lock(lo, nbl); nfsd4_run_cb(&nbl->nbl_cb); + } } static const struct lock_manager_operations nfsd_posix_mng_ops = { @@ -7229,7 +7338,6 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, unsigned int strhashval; struct nfs4_client_reclaim *crp; - trace_nfsd_clid_reclaim(nn, name.len, name.data); crp = alloc_reclaim(); if (crp) { strhashval = clientstr_hashval(name); @@ -7279,8 +7387,6 @@ nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn) unsigned int strhashval; struct nfs4_client_reclaim *crp = NULL; - trace_nfsd_clid_find(nn, name.len, name.data); - strhashval = clientstr_hashval(name); list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) { if (compare_blob(&crp->cr_name, &name) == 0) { @@ -7486,6 +7592,9 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + nfsd4_ssc_shutdown_umount(nn); +#endif } void diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 14dbfa75059d..9664303afdaf 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -484,6 +484,10 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval) extern int nfsd4_is_junction(struct dentry *dentry); extern int register_cld_notifier(void); extern void unregister_cld_notifier(void); +#ifdef CONFIG_NFSD_V4_2_INTER_SSC +extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn); +#endif + #else /* CONFIG_NFSD_V4 */ static inline int nfsd4_is_junction(struct dentry *dentry) { diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index aff2cda5c6c3..6106697adc04 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -225,15 +225,12 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) * returns a crc32 hash for the filehandle that is compatible with * the one displayed by "wireshark". 
*/ - -static inline u32 -knfsd_fh_hash(struct knfsd_fh *fh) +static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) { return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size); } #else -static inline u32 -knfsd_fh_hash(struct knfsd_fh *fh) +static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) { return 0; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index dd5d69921676..ccb59e91011b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -403,6 +403,9 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred) if (ret) goto out_filecache; +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + nfsd4_ssc_init_umount_work(nn); +#endif nn->nfsd_net_up = true; return 0; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 27a93ebd1d80..adaec43548d1 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -408,7 +408,6 @@ TRACE_EVENT(nfsd_dirent, __entry->ino = ino; __entry->len = namlen; memcpy(__get_str(name), name, namlen); - __assign_str(name, name); ), TP_printk("fh_hash=0x%08x ino=%llu name=%.*s", __entry->fh_hash, __entry->ino, @@ -459,7 +458,6 @@ DEFINE_STATEID_EVENT(layout_recall_release); DEFINE_STATEID_EVENT(open); DEFINE_STATEID_EVENT(deleg_read); -DEFINE_STATEID_EVENT(deleg_break); DEFINE_STATEID_EVENT(deleg_recall); DECLARE_EVENT_CLASS(nfsd_stateseqid_class, @@ -511,7 +509,12 @@ DEFINE_EVENT(nfsd_clientid_class, nfsd_clid_##name, \ TP_PROTO(const clientid_t *clid), \ TP_ARGS(clid)) -DEFINE_CLIENTID_EVENT(expired); +DEFINE_CLIENTID_EVENT(expire_unconf); +DEFINE_CLIENTID_EVENT(reclaim_complete); +DEFINE_CLIENTID_EVENT(confirmed); +DEFINE_CLIENTID_EVENT(destroyed); +DEFINE_CLIENTID_EVENT(admin_expired); +DEFINE_CLIENTID_EVENT(replaced); DEFINE_CLIENTID_EVENT(purged); DEFINE_CLIENTID_EVENT(renew); DEFINE_CLIENTID_EVENT(stale); @@ -536,58 +539,102 @@ DEFINE_EVENT(nfsd_net_class, nfsd_##name, \ DEFINE_NET_EVENT(grace_start); DEFINE_NET_EVENT(grace_complete); -DECLARE_EVENT_CLASS(nfsd_clid_class, - TP_PROTO(const struct nfsd_net *nn, - unsigned int namelen, - const unsigned char *namedata), - TP_ARGS(nn, namelen, namedata), +TRACE_EVENT(nfsd_clid_cred_mismatch, + TP_PROTO( + const struct nfs4_client *clp, + const struct svc_rqst *rqstp + ), + TP_ARGS(clp, rqstp), TP_STRUCT__entry( - __field(unsigned long long, boot_time) - __field(unsigned int, namelen) - __dynamic_array(unsigned char, name, namelen) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(unsigned long, cl_flavor) + __field(unsigned long, new_flavor) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( - __entry->boot_time = nn->boot_time; - __entry->namelen = namelen; - memcpy(__get_dynamic_array(name), namedata, namelen); + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __entry->cl_flavor = clp->cl_cred.cr_flavor; + __entry->new_flavor = rqstp->rq_cred.cr_flavor; + memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote, + sizeof(struct sockaddr_in6)); ), - TP_printk("boot_time=%16llx nfs4_clientid=%.*s", - __entry->boot_time, __entry->namelen, __get_str(name)) + TP_printk("client %08x:%08x flavor=%s, conflict=%s from addr=%pISpc", + __entry->cl_boot, __entry->cl_id, + show_nfsd_authflavor(__entry->cl_flavor), + show_nfsd_authflavor(__entry->new_flavor), __entry->addr + ) ) -#define DEFINE_CLID_EVENT(name) \ -DEFINE_EVENT(nfsd_clid_class, nfsd_clid_##name, \ - TP_PROTO(const struct nfsd_net *nn, \ - unsigned int namelen, \ - const unsigned char *namedata), \ - TP_ARGS(nn, namelen, namedata)) - -DEFINE_CLID_EVENT(find); 
-DEFINE_CLID_EVENT(reclaim); +TRACE_EVENT(nfsd_clid_verf_mismatch, + TP_PROTO( + const struct nfs4_client *clp, + const struct svc_rqst *rqstp, + const nfs4_verifier *verf + ), + TP_ARGS(clp, rqstp, verf), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __array(unsigned char, cl_verifier, NFS4_VERIFIER_SIZE) + __array(unsigned char, new_verifier, NFS4_VERIFIER_SIZE) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + memcpy(__entry->cl_verifier, (void *)&clp->cl_verifier, + NFS4_VERIFIER_SIZE); + memcpy(__entry->new_verifier, (void *)verf, + NFS4_VERIFIER_SIZE); + memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote, + sizeof(struct sockaddr_in6)); + ), + TP_printk("client %08x:%08x verf=0x%s, updated=0x%s from addr=%pISpc", + __entry->cl_boot, __entry->cl_id, + __print_hex_str(__entry->cl_verifier, NFS4_VERIFIER_SIZE), + __print_hex_str(__entry->new_verifier, NFS4_VERIFIER_SIZE), + __entry->addr + ) +); -TRACE_EVENT(nfsd_clid_inuse_err, +DECLARE_EVENT_CLASS(nfsd_clid_class, TP_PROTO(const struct nfs4_client *clp), TP_ARGS(clp), TP_STRUCT__entry( __field(u32, cl_boot) __field(u32, cl_id) __array(unsigned char, addr, sizeof(struct sockaddr_in6)) - __field(unsigned int, namelen) - __dynamic_array(unsigned char, name, clp->cl_name.len) + __field(unsigned long, flavor) + __array(unsigned char, verifier, NFS4_VERIFIER_SIZE) + __dynamic_array(char, name, clp->cl_name.len + 1) ), TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; __entry->cl_id = clp->cl_clientid.cl_id; memcpy(__entry->addr, &clp->cl_addr, sizeof(struct sockaddr_in6)); - __entry->namelen = clp->cl_name.len; - memcpy(__get_dynamic_array(name), clp->cl_name.data, - clp->cl_name.len); - ), - TP_printk("nfs4_clientid %.*s already in use by %pISpc, client %08x:%08x", - __entry->namelen, __get_str(name), __entry->addr, + __entry->flavor = clp->cl_cred.cr_flavor; + memcpy(__entry->verifier, (void *)&clp->cl_verifier, + NFS4_VERIFIER_SIZE); + memcpy(__get_str(name), clp->cl_name.data, clp->cl_name.len); + __get_str(name)[clp->cl_name.len] = '\0'; + ), + TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x", + __entry->addr, __get_str(name), + __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE), + show_nfsd_authflavor(__entry->flavor), __entry->cl_boot, __entry->cl_id) -) +); + +#define DEFINE_CLID_EVENT(name) \ +DEFINE_EVENT(nfsd_clid_class, nfsd_clid_##name, \ + TP_PROTO(const struct nfs4_client *clp), \ + TP_ARGS(clp)) + +DEFINE_CLID_EVENT(fresh); +DEFINE_CLID_EVENT(confirmed_r); /* * from fs/nfsd/filecache.h @@ -809,9 +856,9 @@ TRACE_EVENT(nfsd_cb_args, memcpy(__entry->addr, &conn->cb_addr, sizeof(struct sockaddr_in6)); ), - TP_printk("client %08x:%08x callback addr=%pISpc prog=%u ident=%u", - __entry->cl_boot, __entry->cl_id, - __entry->addr, __entry->prog, __entry->ident) + TP_printk("addr=%pISpc client %08x:%08x prog=%u ident=%u", + __entry->addr, __entry->cl_boot, __entry->cl_id, + __entry->prog, __entry->ident) ); TRACE_EVENT(nfsd_cb_nodelegs, @@ -828,11 +875,6 @@ TRACE_EVENT(nfsd_cb_nodelegs, TP_printk("client %08x:%08x", __entry->cl_boot, __entry->cl_id) ) -TRACE_DEFINE_ENUM(NFSD4_CB_UP); -TRACE_DEFINE_ENUM(NFSD4_CB_UNKNOWN); -TRACE_DEFINE_ENUM(NFSD4_CB_DOWN); -TRACE_DEFINE_ENUM(NFSD4_CB_FAULT); - #define show_cb_state(val) \ __print_symbolic(val, \ { NFSD4_CB_UP, "UP" }, \ @@ -866,10 +908,53 @@ DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name, \ TP_PROTO(const 
struct nfs4_client *clp), \ TP_ARGS(clp)) -DEFINE_NFSD_CB_EVENT(setup); DEFINE_NFSD_CB_EVENT(state); +DEFINE_NFSD_CB_EVENT(probe); +DEFINE_NFSD_CB_EVENT(lost); DEFINE_NFSD_CB_EVENT(shutdown); +TRACE_DEFINE_ENUM(RPC_AUTH_NULL); +TRACE_DEFINE_ENUM(RPC_AUTH_UNIX); +TRACE_DEFINE_ENUM(RPC_AUTH_GSS); +TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5); +TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5I); +TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5P); + +#define show_nfsd_authflavor(val) \ + __print_symbolic(val, \ + { RPC_AUTH_NULL, "none" }, \ + { RPC_AUTH_UNIX, "sys" }, \ + { RPC_AUTH_GSS, "gss" }, \ + { RPC_AUTH_GSS_KRB5, "krb5" }, \ + { RPC_AUTH_GSS_KRB5I, "krb5i" }, \ + { RPC_AUTH_GSS_KRB5P, "krb5p" }) + +TRACE_EVENT(nfsd_cb_setup, + TP_PROTO(const struct nfs4_client *clp, + const char *netid, + rpc_authflavor_t authflavor + ), + TP_ARGS(clp, netid, authflavor), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(unsigned long, authflavor) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __array(unsigned char, netid, 8) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + strlcpy(__entry->netid, netid, sizeof(__entry->netid)); + __entry->authflavor = authflavor; + memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("addr=%pISpc client %08x:%08x proto=%s flavor=%s", + __entry->addr, __entry->cl_boot, __entry->cl_id, + __entry->netid, show_nfsd_authflavor(__entry->authflavor)) +); + TRACE_EVENT(nfsd_cb_setup_err, TP_PROTO( const struct nfs4_client *clp, @@ -893,52 +978,97 @@ TRACE_EVENT(nfsd_cb_setup_err, __entry->addr, __entry->cl_boot, __entry->cl_id, __entry->error) ); -TRACE_EVENT(nfsd_cb_work, +TRACE_EVENT(nfsd_cb_recall, TP_PROTO( - const struct nfs4_client *clp, - const char *procedure + const struct nfs4_stid *stid ), - TP_ARGS(clp, procedure), + TP_ARGS(stid), TP_STRUCT__entry( __field(u32, cl_boot) __field(u32, cl_id) - __string(procedure, procedure) + __field(u32, si_id) + __field(u32, si_generation) __array(unsigned char, addr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( + const stateid_t *stp = &stid->sc_stateid; + const struct nfs4_client *clp = stid->sc_client; + + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + if (clp) + memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, + sizeof(struct sockaddr_in6)); + else + memset(__entry->addr, 0, sizeof(struct sockaddr_in6)); + ), + TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x", + __entry->addr, __entry->cl_boot, __entry->cl_id, + __entry->si_id, __entry->si_generation) +); + +TRACE_EVENT(nfsd_cb_notify_lock, + TP_PROTO( + const struct nfs4_lockowner *lo, + const struct nfsd4_blocked_lock *nbl + ), + TP_ARGS(lo, nbl), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, fh_hash) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + const struct nfs4_client *clp = lo->lo_owner.so_client; + __entry->cl_boot = clp->cl_clientid.cl_boot; __entry->cl_id = clp->cl_clientid.cl_id; - __assign_str(procedure, procedure) + __entry->fh_hash = knfsd_fh_hash(&nbl->nbl_fh); memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, sizeof(struct sockaddr_in6)); ), - TP_printk("addr=%pISpc client %08x:%08x procedure=%s", + TP_printk("addr=%pISpc client %08x:%08x fh_hash=0x%08x", __entry->addr, __entry->cl_boot, __entry->cl_id, - 
__get_str(procedure)) + __entry->fh_hash) ); -TRACE_EVENT(nfsd_cb_done, +TRACE_EVENT(nfsd_cb_offload, TP_PROTO( const struct nfs4_client *clp, - int status + const stateid_t *stp, + const struct knfsd_fh *fh, + u64 count, + __be32 status ), - TP_ARGS(clp, status), + TP_ARGS(clp, stp, fh, count, status), TP_STRUCT__entry( __field(u32, cl_boot) __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + __field(u32, fh_hash) __field(int, status) + __field(u64, count) __array(unsigned char, addr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( - __entry->cl_boot = clp->cl_clientid.cl_boot; - __entry->cl_id = clp->cl_clientid.cl_id; - __entry->status = status; + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + __entry->fh_hash = knfsd_fh_hash(fh); + __entry->status = be32_to_cpu(status); + __entry->count = count; memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, sizeof(struct sockaddr_in6)); ), - TP_printk("addr=%pISpc client %08x:%08x status=%d", + TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x fh_hash=0x%08x count=%llu status=%d", __entry->addr, __entry->cl_boot, __entry->cl_id, - __entry->status) + __entry->si_id, __entry->si_generation, + __entry->fh_hash, __entry->count, __entry->status) ); #endif /* _NFSD_TRACE_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 15adf1f6ab21..a224a5e23cc1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1123,6 +1123,19 @@ out: } #ifdef CONFIG_NFSD_V3 +static int +nfsd_filemap_write_and_wait_range(struct nfsd_file *nf, loff_t offset, + loff_t end) +{ + struct address_space *mapping = nf->nf_file->f_mapping; + int ret = filemap_fdatawrite_range(mapping, offset, end); + + if (ret) + return ret; + filemap_fdatawait_range_keep_errors(mapping, offset, end); + return 0; +} + /* * Commit all pending writes to stable storage. 
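The nfsd_filemap_write_and_wait_range() helper added above kicks off writeback for just the commit range and then waits with the error-preserving variant, so the vfs_fsync_range() call that follows in nfsd_commit() (under nf_rwsem, shown below) still observes any writeback error. A loose userspace analogue of this start-I/O-early, harvest-errors-at-the-sync-point pattern, using Linux's sync_file_range(2) (the helper name and file are invented for illustration):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Queue dirty pages of [offset, offset+len) for I/O without waiting,
 * then let a single fsync() act as the sync point that waits and
 * reports errors -- loosely mirroring the nfsd_commit() sequence. */
static int commit_range(int fd, off_t offset, off_t len)
{
	if (sync_file_range(fd, offset, len, SYNC_FILE_RANGE_WRITE) < 0)
		return -errno;	/* failed to start writeback */
	if (fsync(fd) < 0)
		return -errno;	/* waits, and is where errors surface */
	return 0;
}

int main(void)
{
	int ret, fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0 || write(fd, "data", 4) != 4)
		return 1;
	ret = commit_range(fd, 0, 4);
	if (ret)
		fprintf(stderr, "commit failed: %s\n", strerror(-ret));
	close(fd);
	return 0;
}

Starting the I/O before taking the semaphore shortens the window during which other commits on the same file are blocked, which appears to be the point of the split.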
* @@ -1153,10 +1166,11 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) goto out; if (EX_ISSYNC(fhp->fh_export)) { - int err2; + int err2 = nfsd_filemap_write_and_wait_range(nf, offset, end); down_write(&nf->nf_rwsem); - err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); + if (!err2) + err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); switch (err2) { case 0: nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net, @@ -1613,9 +1627,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path); err = nfserrno(host_err); + fh_unlock(fhp); if (!err) err = nfserrno(commit_metadata(fhp)); - fh_unlock(fhp); fh_drop_write(fhp); @@ -1680,6 +1694,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (d_really_is_negative(dold)) goto out_dput; host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL); + fh_unlock(ffhp); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); if (!err) @@ -1859,6 +1874,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, { struct dentry *dentry, *rdentry; struct inode *dirp; + struct inode *rinode; __be32 err; int host_err; @@ -1887,6 +1903,8 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, host_err = -ENOENT; goto out_drop_write; } + rinode = d_inode(rdentry); + ihold(rinode); if (!type) type = d_inode(rdentry)->i_mode & S_IFMT; @@ -1899,9 +1917,11 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, host_err = vfs_rmdir(&init_user_ns, dirp, rdentry); } + fh_unlock(fhp); if (!host_err) host_err = commit_metadata(fhp); dput(rdentry); + iput(rinode); /* truncate the inode here */ out_drop_write: fh_drop_write(fhp); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index a7c425254fee..3e4052e3bd50 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -567,6 +567,7 @@ struct nfsd4_copy { struct vfsmount *ss_mnt; struct nfs_fh c_fh; nfs4_stateid stateid; + bool committed; }; struct nfsd4_seek { diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 6bf35a0d61f3..16ac617df7d7 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -249,8 +249,7 @@ static void orangefs_readahead(struct readahead_control *rac) { loff_t offset; struct iov_iter iter; - struct file *file = rac->file; - struct inode *inode = file->f_mapping->host; + struct inode *inode = rac->mapping->host; struct xarray *i_pages; struct page *page; loff_t new_start = readahead_pos(rac); @@ -269,14 +268,14 @@ static void orangefs_readahead(struct readahead_control *rac) readahead_expand(rac, new_start, new_len); offset = readahead_pos(rac); - i_pages = &file->f_mapping->i_pages; + i_pages = &rac->mapping->i_pages; iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac)); /* read in the pages. */ if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &offset, &iter, readahead_length(rac), - inode->i_size, NULL, NULL, file)) < 0) + inode->i_size, NULL, NULL, rac->file)) < 0) gossip_debug(GOSSIP_FILE_DEBUG, "%s: wait_for_direct_io failed. 
\n", __func__); else diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index ee5efdc35cc1..2f2e430461b2 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -209,7 +209,7 @@ static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = (sector_t) new_op->downcall.resp.statfs.blocks_avail; buf->f_files = (sector_t) new_op->downcall.resp.statfs.files_total; buf->f_ffree = (sector_t) new_op->downcall.resp.statfs.files_avail; - buf->f_frsize = sb->s_blocksize; + buf->f_frsize = 0; out_op_release: op_release(new_op); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 1bbb9fe661b1..fc718f6178f2 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2824,7 +2824,7 @@ void dbg_debugfs_init_fs(struct ubifs_info *c) n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, c->vi.ubi_num, c->vi.vol_id); - if (n == UBIFS_DFS_DIR_LEN) { + if (n > UBIFS_DFS_DIR_LEN) { /* The array size is too small */ return; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 5bd8482e660a..7c61d0ec0159 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1337,7 +1337,10 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_release; } + spin_lock(&whiteout->i_lock); whiteout->i_state |= I_LINKABLE; + spin_unlock(&whiteout->i_lock); + whiteout_ui = ubifs_inode(whiteout); whiteout_ui->data = dev; whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); @@ -1430,7 +1433,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, inc_nlink(whiteout); mark_inode_dirty(whiteout); + + spin_lock(&whiteout->i_lock); whiteout->i_state &= ~I_LINKABLE; + spin_unlock(&whiteout->i_lock); + iput(whiteout); } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 2857e64d673d..8ea680dba61e 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -882,6 +882,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) struct ubifs_dent_node *xent, *pxent = NULL; if (ui->xattr_cnt > ubifs_xattr_max_cnt(c)) { + err = -EPERM; ubifs_err(c, "Cannot delete inode, it has too much xattrs!"); goto out_release; } @@ -1431,7 +1432,7 @@ out_free: /** * truncate_data_node - re-compress/encrypt a truncated data node. * @c: UBIFS file-system description object - * @inode: inode which referes to the data node + * @inode: inode which refers to the data node * @block: data block number * @dn: data node to re-compress * @new_len: new length diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 0df9a3dd0aaa..7adc37c10b6a 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -37,7 +37,7 @@ int ubifs_compare_master_node(struct ubifs_info *c, void *m1, void *m2) return ret; /* - * Do not compare the embedded HMAC aswell which also must be different + * Do not compare the embedded HMAC as well which also must be different * due to the different common node header. */ behind = hmac_offs + UBIFS_MAX_HMAC_LEN; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 382a54c82930..5260d3e531bb 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -296,7 +296,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) * @b: second replay entry * * This is a comparios function for 'list_sort()' which compares 2 replay - * entries @a and @b by comparing their sequence numer. Returns %1 if @a has + * entries @a and @b by comparing their sequence number. Returns %1 if @a has * greater sequence number and %-1 otherwise. 
*/ static int replay_entries_cmp(void *priv, const struct list_head *a, diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 7b572e1414ba..f0fb25727d96 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -275,6 +275,7 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb) memset((void *)ui + sizeof(struct inode), 0, sizeof(struct ubifs_inode) - sizeof(struct inode)); mutex_init(&ui->ui_mutex); + init_rwsem(&ui->xattr_sem); spin_lock_init(&ui->ui_lock); return &ui->vfs_inode; }; @@ -2060,7 +2061,7 @@ const struct super_operations ubifs_super_operations = { * @mode: UBI volume open mode * * The primary method of mounting UBIFS is by specifying the UBI volume - * character device node path. However, UBIFS may also be mounted withoug any + * character device node path. However, UBIFS may also be mounted without any * character device node using one of the following methods: * * o ubiX_Y - mount UBI device number X, volume Y; diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 234be1c4dc87..58c92c96ecef 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -930,7 +930,7 @@ static int write_index(struct ubifs_info *c) * flag cleared before %COW_ZNODE. Specifically, it matters in * the 'dirty_cow_znode()' function. This is the reason for the * first barrier. Also, we want the bit changes to be seen to - * other threads ASAP, to avoid unnecesarry copying, which is + * other threads ASAP, to avoid unnecessary copying, which is * the reason for the second barrier. */ clear_bit(DIRTY_ZNODE, &znode->flags); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index b65c599a386a..c38066ce9ab0 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -356,6 +356,7 @@ struct ubifs_gced_idx_leb { * @ui_mutex: serializes inode write-back with the rest of VFS operations, * serializes "clean <-> dirty" state changes, serializes bulk-read, * protects @dirty, @bulk_read, @ui_size, and @xattr_size + * @xattr_sem: serilizes write operations (remove|set|create) on xattr * @ui_lock: protects @synced_i_size * @synced_i_size: synchronized size of inode, i.e. the value of inode size * currently stored on the flash; used only for regular file @@ -409,6 +410,7 @@ struct ubifs_inode { unsigned int bulk_read:1; unsigned int compr_type:2; struct mutex ui_mutex; + struct rw_semaphore xattr_sem; spinlock_t ui_lock; loff_t synced_i_size; loff_t ui_size; @@ -912,7 +914,7 @@ struct ubifs_budget_req { * @rb: rb-tree node of rb-tree of orphans sorted by inode number * @list: list head of list of orphans in order added * @new_list: list head of list of orphans added since the last commit - * @child_list: list of xattr childs if this orphan hosts xattrs, list head + * @child_list: list of xattr children if this orphan hosts xattrs, list head * if this orphan is a xattr, not used otherwise. 
* @cnext: next orphan to commit * @dnext: next orphan to delete diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 6b1e9830b274..e4f193eae4b2 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -208,13 +208,11 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, err = -ENOMEM; goto out_free; } - mutex_lock(&ui->ui_mutex); kfree(ui->data); ui->data = buf; inode->i_size = ui->ui_size = size; old_size = ui->data_len; ui->data_len = size; - mutex_unlock(&ui->ui_mutex); mutex_lock(&host_ui->ui_mutex); host->i_ctime = current_time(host); @@ -285,6 +283,7 @@ int ubifs_xattr_set(struct inode *host, const char *name, const void *value, if (!xent) return -ENOMEM; + down_write(&ubifs_inode(host)->xattr_sem); /* * The extended attribute entries are stored in LNC, so multiple * look-ups do not involve reading the flash. @@ -319,6 +318,7 @@ int ubifs_xattr_set(struct inode *host, const char *name, const void *value, iput(inode); out_free: + up_write(&ubifs_inode(host)->xattr_sem); kfree(xent); return err; } @@ -341,25 +341,25 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, if (!xent) return -ENOMEM; + down_read(&ubifs_inode(host)->xattr_sem); xent_key_init(c, &key, host->i_ino, &nm); err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); if (err) { if (err == -ENOENT) err = -ENODATA; - goto out_unlock; + goto out_cleanup; } inode = iget_xattr(c, le64_to_cpu(xent->inum)); if (IS_ERR(inode)) { err = PTR_ERR(inode); - goto out_unlock; + goto out_cleanup; } ui = ubifs_inode(inode); ubifs_assert(c, inode->i_size == ui->data_len); ubifs_assert(c, ubifs_inode(host)->xattr_size > ui->data_len); - mutex_lock(&ui->ui_mutex); if (buf) { /* If @buf is %NULL we are supposed to return the length */ if (ui->data_len > size) { @@ -372,9 +372,9 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, err = ui->data_len; out_iput: - mutex_unlock(&ui->ui_mutex); iput(inode); -out_unlock: +out_cleanup: + up_read(&ubifs_inode(host)->xattr_sem); kfree(xent); return err; } @@ -406,16 +406,21 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino, dentry, size); + down_read(&host_ui->xattr_sem); len = host_ui->xattr_names + host_ui->xattr_cnt; - if (!buffer) + if (!buffer) { /* * We should return the minimum buffer size which will fit a * null-terminated list of all the extended attribute names. 
*/ - return len; + err = len; + goto out_err; + } - if (len > size) - return -ERANGE; + if (len > size) { + err = -ERANGE; + goto out_err; + } lowest_xent_key(c, &key, host->i_ino); while (1) { @@ -437,8 +442,9 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) pxent = xent; key_read(c, &xent->key, &key); } - kfree(pxent); + up_read(&host_ui->xattr_sem); + if (err != -ENOENT) { ubifs_err(c, "cannot find next direntry, error %d", err); return err; @@ -446,6 +452,10 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) ubifs_assert(c, written <= size); return written; + +out_err: + up_read(&host_ui->xattr_sem); + return err; } static int remove_xattr(struct ubifs_info *c, struct inode *host, @@ -504,6 +514,7 @@ int ubifs_purge_xattrs(struct inode *host) ubifs_warn(c, "inode %lu has too many xattrs, doing a non-atomic deletion", host->i_ino); + down_write(&ubifs_inode(host)->xattr_sem); lowest_xent_key(c, &key, host->i_ino); while (1) { xent = ubifs_tnc_next_ent(c, &key, &nm); @@ -523,7 +534,7 @@ int ubifs_purge_xattrs(struct inode *host) ubifs_ro_mode(c, err); kfree(pxent); kfree(xent); - return err; + goto out_err; } ubifs_assert(c, ubifs_inode(xino)->xattr); @@ -535,7 +546,7 @@ int ubifs_purge_xattrs(struct inode *host) kfree(xent); iput(xino); ubifs_err(c, "cannot remove xattr, error %d", err); - return err; + goto out_err; } iput(xino); @@ -544,14 +555,19 @@ int ubifs_purge_xattrs(struct inode *host) pxent = xent; key_read(c, &xent->key, &key); } - kfree(pxent); + up_write(&ubifs_inode(host)->xattr_sem); + if (err != -ENOENT) { ubifs_err(c, "cannot find next direntry, error %d", err); return err; } return 0; + +out_err: + up_write(&ubifs_inode(host)->xattr_sem); + return err; } /** @@ -594,6 +610,7 @@ static int ubifs_xattr_remove(struct inode *host, const char *name) if (!xent) return -ENOMEM; + down_write(&ubifs_inode(host)->xattr_sem); xent_key_init(c, &key, host->i_ino, &nm); err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); if (err) { @@ -618,6 +635,7 @@ static int ubifs_xattr_remove(struct inode *host, const char *name) iput(inode); out_free: + up_write(&ubifs_inode(host)->xattr_sem); kfree(xent); return err; }
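Taken together, the xattr.c changes above replace reliance on the xattr inode's ui_mutex with a host-wide xattr_sem: ubifs_xattr_set(), ubifs_xattr_remove() and ubifs_purge_xattrs() take it for writing, while ubifs_xattr_get() and ubifs_listxattr() take it for reading, so concurrent lookups can proceed in parallel and only mutations serialize. A userspace sketch of that locking discipline, using a POSIX rwlock in place of the kernel rw_semaphore (the toy structures are invented):

#include <pthread.h>
#include <stdio.h>

/* Toy "host inode" with a single xattr slot, guarded the way the
 * patch guards the per-host xattr state. */
struct host_inode {
	pthread_rwlock_t xattr_sem;
	char xattr_value[64];
};

static void xattr_set(struct host_inode *h, const char *val)
{
	pthread_rwlock_wrlock(&h->xattr_sem);	/* exclusive: mutation */
	snprintf(h->xattr_value, sizeof(h->xattr_value), "%s", val);
	pthread_rwlock_unlock(&h->xattr_sem);
}

static void xattr_get(struct host_inode *h, char *buf, size_t len)
{
	pthread_rwlock_rdlock(&h->xattr_sem);	/* shared: readers overlap */
	snprintf(buf, len, "%s", h->xattr_value);
	pthread_rwlock_unlock(&h->xattr_sem);
}

int main(void)
{
	struct host_inode h = { .xattr_sem = PTHREAD_RWLOCK_INITIALIZER };
	char buf[64];

	xattr_set(&h, "user.test=1");
	xattr_get(&h, buf, sizeof(buf));
	printf("%s\n", buf);
	return 0;
}

Note how, as in the patch, every error path must drop the lock (the out_err labels added to ubifs_listxattr() and ubifs_purge_xattrs() exist exactly for that); the sketch above has no early returns, which sidesteps the same hazard.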