From 9caf696142252a466fb89e629d0eddcdced027b0 Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Thu, 9 Mar 2023 22:09:30 +1100 Subject: kernfs: Introduce separate rwsem to protect inode attributes. Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple kernfs operations. On a large system with few hundred CPUs and few hundred applications simultaneoulsy trying to access sysfs, this results in multiple sys_open(s) contending on kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate. For example on a system with 384 cores, if I run 200 instances of an application which is mostly executing the following loop: for (int loop = 0; loop <100 ; loop++) { for (int port_num = 1; port_num < 2; port_num++) { for (int gid_index = 0; gid_index < 254; gid_index++ ) { char ret_buf[64], ret_buf_lo[64]; char gid_file_path[1024]; int ret_len; int ret_fd; ssize_t ret_rd; ub4 i, saved_errno; memset(ret_buf, 0, sizeof(ret_buf)); memset(gid_file_path, 0, sizeof(gid_file_path)); ret_len = snprintf(gid_file_path, sizeof(gid_file_path), "/sys/class/infiniband/%s/ports/%d/gids/%d", dev_name, port_num, gid_index); ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC); if (ret_fd < 0) { printf("Failed to open %s\n", gid_file_path); continue; } /* Read the GID */ ret_rd = read(ret_fd, ret_buf, 40); if (ret_rd == -1) { printf("Failed to read from file %s, errno: %u\n", gid_file_path, saved_errno); continue; } close(ret_fd); } } I see contention around kernfs_rwsem as follows: path_openat | |----link_path_walk.part.0.constprop.0 | | | |--49.92%--inode_permission | | | | | --48.69%--kernfs_iop_permission | | | | | |--18.16%--down_read | | | | | |--15.38%--up_read | | | | | --14.58%--_raw_spin_lock | | | | | ----- | | | |--29.08%--walk_component | | | | | --29.02%--lookup_fast | | | | | |--24.26%--kernfs_dop_revalidate | | | | | | | |--14.97%--down_read | | | | | | | --9.01%--up_read | | | | | --4.74%--__d_lookup | | | | | --4.64%--_raw_spin_lock | | | | | ---- Having a separate per-fs rwsem to protect kernfs inode attributes, will avoid the above mentioned contention and result in better performance as can bee seen below: path_openat | |----link_path_walk.part.0.constprop.0 | | | | | |--27.06%--inode_permission | | | | | --25.84%--kernfs_iop_permission | | | | | |--9.29%--up_read | | | | | |--8.19%--down_read | | | | | --7.89%--_raw_spin_lock | | | | | ---- | | | |--22.42%--walk_component | | | | | --22.36%--lookup_fast | | | | | |--16.07%--__d_lookup | | | | | | | --16.01%--_raw_spin_lock | | | | | | | ---- | | | | | --6.28%--kernfs_dop_revalidate | | | | | |--3.76%--down_read | | | | | --2.26%--up_read As can be seen from the above data the overhead due to both kerfs_iop_permission and kernfs_dop_revalidate have gone down and this also reduces overall run time of the earlier mentioned loop. Signed-off-by: Imran Khan Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.com Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 7 +++++++ fs/kernfs/inode.c | 16 ++++++++-------- fs/kernfs/kernfs-internal.h | 1 + 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'fs/kernfs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index ef00b5fe8cee..953b2717c60e 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -770,12 +770,15 @@ int kernfs_add_one(struct kernfs_node *kn) goto out_unlock; /* Update timestamps on the parent */ + down_write(&root->kernfs_iattr_rwsem); + ps_iattr = parent->iattr; if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } + up_write(&root->kernfs_iattr_rwsem); up_write(&root->kernfs_rwsem); /* @@ -940,6 +943,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, idr_init(&root->ino_idr); init_rwsem(&root->kernfs_rwsem); + init_rwsem(&root->kernfs_iattr_rwsem); INIT_LIST_HEAD(&root->supers); /* @@ -1462,11 +1466,14 @@ static void __kernfs_remove(struct kernfs_node *kn) pos->parent ? pos->parent->iattr : NULL; /* update timestamps on the parent */ + down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); + if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } + up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); kernfs_put(pos); } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 30494dcb0df3..b22b74d1a115 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -101,9 +101,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) int ret; struct kernfs_root *root = kernfs_root(kn); - down_write(&root->kernfs_rwsem); + down_write(&root->kernfs_iattr_rwsem); ret = __kernfs_setattr(kn, iattr); - up_write(&root->kernfs_rwsem); + up_write(&root->kernfs_iattr_rwsem); return ret; } @@ -119,7 +119,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, return -EINVAL; root = kernfs_root(kn); - down_write(&root->kernfs_rwsem); + down_write(&root->kernfs_iattr_rwsem); error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) goto out; @@ -132,7 +132,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, setattr_copy(&nop_mnt_idmap, inode, iattr); out: - up_write(&root->kernfs_rwsem); + up_write(&root->kernfs_iattr_rwsem); return error; } @@ -189,10 +189,10 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap, struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); - down_read(&root->kernfs_rwsem); + down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); generic_fillattr(&nop_mnt_idmap, inode, stat); - up_read(&root->kernfs_rwsem); + up_read(&root->kernfs_iattr_rwsem); return 0; } @@ -285,10 +285,10 @@ int kernfs_iop_permission(struct mnt_idmap *idmap, kn = inode->i_private; root = kernfs_root(kn); - down_read(&root->kernfs_rwsem); + down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); ret = generic_permission(&nop_mnt_idmap, inode, mask); - up_read(&root->kernfs_rwsem); + up_read(&root->kernfs_iattr_rwsem); return ret; } diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 236c3a6113f1..3297093c920d 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -47,6 +47,7 @@ struct kernfs_root { wait_queue_head_t deactivate_waitq; struct rw_semaphore kernfs_rwsem; + struct rw_semaphore kernfs_iattr_rwsem; }; /* +1 to avoid triggering overflow warning when negating it */ -- cgit From c9f2dfb7b59e5a6db054f821a6e1a6db8fa57d64 Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Thu, 9 Mar 2023 22:09:31 +1100 Subject: kernfs: Use a per-fs rwsem to protect per-fs list of kernfs_super_info. Right now per-fs kernfs_rwsem protects list of kernfs_super_info instances for a kernfs_root. Since kernfs_rwsem is used to synchronize several other operations across kernfs and since most of these operations don't impact kernfs_super_info, we can use a separate per-fs rwsem to synchronize access to list of kernfs_super_info. This helps in reducing contention around kernfs_rwsem and also allows operations that change/access list of kernfs_super_info to proceed without contending for kernfs_rwsem. Signed-off-by: Imran Khan Link: https://lore.kernel.org/r/20230309110932.2889010-3-imran.f.khan@oracle.com Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 1 + fs/kernfs/file.c | 4 ++-- fs/kernfs/kernfs-internal.h | 1 + fs/kernfs/mount.c | 8 ++++---- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs/kernfs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 953b2717c60e..2cdb8516e528 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -944,6 +944,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, idr_init(&root->ino_idr); init_rwsem(&root->kernfs_rwsem); init_rwsem(&root->kernfs_iattr_rwsem); + init_rwsem(&root->kernfs_supers_rwsem); INIT_LIST_HEAD(&root->supers); /* diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e4a50e4ff0d2..40c4661f15b7 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -922,8 +922,8 @@ repeat: root = kernfs_root(kn); /* kick fsnotify */ - down_write(&root->kernfs_rwsem); + down_read(&root->kernfs_supers_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; struct inode *p_inode = NULL; @@ -960,7 +960,7 @@ repeat: iput(inode); } - up_write(&root->kernfs_rwsem); + up_read(&root->kernfs_supers_rwsem); kernfs_put(kn); goto repeat; } diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 3297093c920d..a9b854cdfdb5 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -48,6 +48,7 @@ struct kernfs_root { wait_queue_head_t deactivate_waitq; struct rw_semaphore kernfs_rwsem; struct rw_semaphore kernfs_iattr_rwsem; + struct rw_semaphore kernfs_supers_rwsem; }; /* +1 to avoid triggering overflow warning when negating it */ diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index e08e8d999807..d49606accb07 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -351,9 +351,9 @@ int kernfs_get_tree(struct fs_context *fc) } sb->s_flags |= SB_ACTIVE; - down_write(&root->kernfs_rwsem); + down_write(&root->kernfs_supers_rwsem); list_add(&info->node, &info->root->supers); - up_write(&root->kernfs_rwsem); + up_write(&root->kernfs_supers_rwsem); } fc->root = dget(sb->s_root); @@ -380,9 +380,9 @@ void kernfs_kill_sb(struct super_block *sb) struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *root = info->root; - down_write(&root->kernfs_rwsem); + down_write(&root->kernfs_supers_rwsem); list_del(&info->node); - up_write(&root->kernfs_rwsem); + up_write(&root->kernfs_supers_rwsem); /* * Remove the superblock from fs_supers/s_instances -- cgit From 06fb4736139fde0e1431fe2a5b4912dcb7588d09 Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Thu, 9 Mar 2023 22:09:32 +1100 Subject: kernfs: change kernfs_rename_lock into a read-write lock. kernfs_rename_lock protects a node's ->parent and thus kernfs topology. Thus it can be used in cases that rely on a stable kernfs topology. Change it to a read-write lock for better scalability. Suggested by: Al Viro Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Imran Khan Link: https://lore.kernel.org/r/20230309110932.2889010-4-imran.f.khan@oracle.com Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/kernfs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 2cdb8516e528..06e27b36216f 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -17,7 +17,7 @@ #include "kernfs-internal.h" -static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ +static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ /* * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to * call pr_cont() while holding rename_lock. Because sometimes pr_cont() @@ -196,9 +196,9 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) unsigned long flags; int ret; - spin_lock_irqsave(&kernfs_rename_lock, flags); + read_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_name_locked(kn, buf, buflen); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + read_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } @@ -224,9 +224,9 @@ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, unsigned long flags; int ret; - spin_lock_irqsave(&kernfs_rename_lock, flags); + read_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_path_from_node_locked(to, from, buf, buflen); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + read_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } EXPORT_SYMBOL_GPL(kernfs_path_from_node); @@ -294,10 +294,10 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) struct kernfs_node *parent; unsigned long flags; - spin_lock_irqsave(&kernfs_rename_lock, flags); + read_lock_irqsave(&kernfs_rename_lock, flags); parent = kn->parent; kernfs_get(parent); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + read_unlock_irqrestore(&kernfs_rename_lock, flags); return parent; } @@ -1731,7 +1731,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, kernfs_get(new_parent); /* rename_lock protects ->parent and ->name accessors */ - spin_lock_irq(&kernfs_rename_lock); + write_lock_irq(&kernfs_rename_lock); old_parent = kn->parent; kn->parent = new_parent; @@ -1742,7 +1742,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, kn->name = new_name; } - spin_unlock_irq(&kernfs_rename_lock); + write_unlock_irq(&kernfs_rename_lock); kn->hash = kernfs_name_hash(kn->name, kn->ns); kernfs_link_sibling(kn); -- cgit