From ba37ff75e04be7df5fa19dcd86f81c984294a37b Mon Sep 17 00:00:00 2001 From: Ajay Kaher Date: Fri, 28 Jul 2023 23:50:44 +0530 Subject: eventfs: Implement tracefs_inode_cache Create a kmem cache of tracefs_inodes. To be more efficient, as there are lots of tracefs inodes, create its own cache. This also makes it possible to see how many tracefs inodes have been created. Add helper functions: tracefs_alloc_inode() tracefs_free_inode() get_tracefs() Link: https://lkml.kernel.org/r/1690568452-46553-3-git-send-email-akaher@vmware.com Signed-off-by: Ajay Kaher Co-developed-by: Steven Rostedt (VMware) Signed-off-by: Steven Rostedt (VMware) Tested-by: Ching-lin Yu Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/inode.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'fs/tracefs/inode.c') diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 57ac8aa4a724..2508944cc4d8 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -21,13 +21,33 @@ #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> +#include "internal.h" #define TRACEFS_DEFAULT_MODE 0700 +static struct kmem_cache *tracefs_inode_cachep __ro_after_init; static struct vfsmount *tracefs_mount; static int tracefs_mount_count; static bool tracefs_registered; +static struct inode *tracefs_alloc_inode(struct super_block *sb) +{ + struct tracefs_inode *ti; + + ti = kmem_cache_alloc(tracefs_inode_cachep, GFP_KERNEL); + if (!ti) + return NULL; + + ti->flags = 0; + + return &ti->vfs_inode; +} + +static void tracefs_free_inode(struct inode *inode) +{ + kmem_cache_free(tracefs_inode_cachep, get_tracefs(inode)); +} + static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -346,6 +366,9 @@ static int tracefs_show_options(struct seq_file *m, struct dentry *root) } static const struct super_operations tracefs_super_operations = { + .alloc_inode = tracefs_alloc_inode, + .free_inode = tracefs_free_inode, + .drop_inode = generic_delete_inode, .statfs = simple_statfs, 
.remount_fs = tracefs_remount, .show_options = tracefs_show_options, @@ -628,10 +651,26 @@ bool tracefs_initialized(void) return tracefs_registered; } +static void init_once(void *foo) +{ + struct tracefs_inode *ti = (struct tracefs_inode *) foo; + + inode_init_once(&ti->vfs_inode); +} + static int __init tracefs_init(void) { int retval; + tracefs_inode_cachep = kmem_cache_create("tracefs_inode_cache", + sizeof(struct tracefs_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD| + SLAB_ACCOUNT), + init_once); + if (!tracefs_inode_cachep) + return -ENOMEM; + retval = sysfs_create_mount_point(kernel_kobj, "tracing"); if (retval) return -EINVAL; -- cgit From 2c6b6b1029d46a8760d6cba09b4e75cb1ac9b579 Mon Sep 17 00:00:00 2001 From: Ajay Kaher Date: Fri, 28 Jul 2023 23:50:45 +0530 Subject: tracefs: Rename and export some tracefs functions Export a few tracefs functions that will be needed by the eventfs dynamic file system. Rename them to start with "tracefs_" to keep with the name space. start_creating -> tracefs_start_creating failed_creating -> tracefs_failed_creating end_creating -> tracefs_end_creating Link: https://lkml.kernel.org/r/1690568452-46553-4-git-send-email-akaher@vmware.com Signed-off-by: Ajay Kaher Co-developed-by: Steven Rostedt (VMware) Signed-off-by: Steven Rostedt (VMware) Tested-by: Ching-lin Yu Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/inode.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs/tracefs/inode.c') diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 2508944cc4d8..4acc4b4dfd22 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -147,7 +147,7 @@ static const struct inode_operations tracefs_dir_inode_operations = { .rmdir = tracefs_syscall_rmdir, }; -static struct inode *tracefs_get_inode(struct super_block *sb) +struct inode *tracefs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); if (inode) { @@ -422,7 +422,7 @@ static struct file_system_type 
trace_fs_type = { }; MODULE_ALIAS_FS("tracefs"); -static struct dentry *start_creating(const char *name, struct dentry *parent) +struct dentry *tracefs_start_creating(const char *name, struct dentry *parent) { struct dentry *dentry; int error; @@ -460,7 +460,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) return dentry; } -static struct dentry *failed_creating(struct dentry *dentry) +struct dentry *tracefs_failed_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); dput(dentry); @@ -468,7 +468,7 @@ static struct dentry *failed_creating(struct dentry *dentry) return NULL; } -static struct dentry *end_creating(struct dentry *dentry) +struct dentry *tracefs_end_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); return dentry; @@ -513,14 +513,14 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); - dentry = start_creating(name, parent); + dentry = tracefs_start_creating(name, parent); if (IS_ERR(dentry)) return NULL; inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return failed_creating(dentry); + return tracefs_failed_creating(dentry); inode->i_mode = mode; inode->i_fop = fops ? 
fops : &tracefs_file_operations; @@ -529,13 +529,13 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, inode->i_gid = d_inode(dentry->d_parent)->i_gid; d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return tracefs_end_creating(dentry); } static struct dentry *__create_dir(const char *name, struct dentry *parent, const struct inode_operations *ops) { - struct dentry *dentry = start_creating(name, parent); + struct dentry *dentry = tracefs_start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) @@ -543,7 +543,7 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return failed_creating(dentry); + return tracefs_failed_creating(dentry); /* Do not set bits for OTH */ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP; @@ -557,7 +557,7 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return tracefs_end_creating(dentry); } /** -- cgit From a3760079177765b7f1782419f1c3e12facaf1e9d Mon Sep 17 00:00:00 2001 From: Ajay Kaher Date: Fri, 28 Jul 2023 23:50:49 +0530 Subject: eventfs: Implement functions to create files and dirs when accessed Add create_file() and create_dir() functions to create the files and directories respectively when they are accessed. The functions will be called from the lookup operation of the inode_operations or from the open function of file_operations. 
Link: https://lkml.kernel.org/r/1690568452-46553-8-git-send-email-akaher@vmware.com Signed-off-by: Ajay Kaher Co-developed-by: Steven Rostedt (VMware) Signed-off-by: Steven Rostedt (VMware) Tested-by: Ching-lin Yu Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/inode.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'fs/tracefs/inode.c') diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 4acc4b4dfd22..d9273066f25f 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -474,6 +474,80 @@ struct dentry *tracefs_end_creating(struct dentry *dentry) return dentry; } +/** + * eventfs_start_creating - start the process of creating a dentry + * @name: Name of the file created for the dentry + * @parent: The parent dentry where this dentry will be created + * + * This is a simple helper function for the dynamically created eventfs + * files. When the directory of the eventfs files is accessed, their + * dentries are created on the fly. This function is used to start that + * process. + */ +struct dentry *eventfs_start_creating(const char *name, struct dentry *parent) +{ + struct dentry *dentry; + int error; + + error = simple_pin_fs(&trace_fs_type, &tracefs_mount, + &tracefs_mount_count); + if (error) + return ERR_PTR(error); + + /* + * If the parent is not specified, we create it in the root. + * We need the root dentry to do this, which is in the super + * block. A pointer to that is in the struct vfsmount that we + * have around. 
+ */ + if (!parent) + parent = tracefs_mount->mnt_root; + + if (unlikely(IS_DEADDIR(parent->d_inode))) + dentry = ERR_PTR(-ENOENT); + else + dentry = lookup_one_len(name, parent, strlen(name)); + + if (!IS_ERR(dentry) && dentry->d_inode) { + dput(dentry); + dentry = ERR_PTR(-EEXIST); + } + + if (IS_ERR(dentry)) + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + + return dentry; +} + +/** + * eventfs_failed_creating - clean up a failed eventfs dentry creation + * @dentry: The dentry to clean up + * + * If after calling eventfs_start_creating(), a failure is detected, the + * resources created by eventfs_start_creating() need to be cleaned up. In + * that case, this function should be called to perform that clean up. + */ +struct dentry *eventfs_failed_creating(struct dentry *dentry) +{ + dput(dentry); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + return NULL; +} + +/** + * eventfs_end_creating - Finish the process of creating an eventfs dentry + * @dentry: The dentry that has successfully been created. + * + * This function is currently just a placeholder to match + * eventfs_start_creating(). In case any synchronization needs to be added, + * this function will be used to implement that without having to modify + * the callers of eventfs_start_creating(). + */ +struct dentry *eventfs_end_creating(struct dentry *dentry) +{ + return dentry; +} + /** * tracefs_create_file - create a file in the tracefs filesystem * @name: a pointer to a string containing the name of the file to create. -- cgit From 27152bceea1df27ffebb12ac9cd9adbf2c4c3f35 Mon Sep 17 00:00:00 2001 From: Ajay Kaher Date: Fri, 28 Jul 2023 23:50:51 +0530 Subject: eventfs: Move tracing/events to eventfs Up until now, /sys/kernel/tracing/events was no different than any other part of tracefs. The files and directories within the events directory were created when the tracefs was mounted, and also created for the instances in /sys/kernel/tracing/instances/<instance>/events. 
Most of these files and directories will never be referenced. Since there are thousands of these files and directories, they spend their time wasting precious memory resources. Move the "events" directory to the new eventfs. The eventfs will take the meta data of the events that they represent and store that. When the files in the events directory are referenced, the dentry and inodes to represent them are then created. When the files are no longer referenced, they are freed. This saves the precious memory resources that were wasted on these seldom referenced dentries and inodes. Running the following: ~# cat /proc/meminfo /proc/slabinfo > before.out ~# mkdir /sys/kernel/tracing/instances/foo ~# cat /proc/meminfo /proc/slabinfo > after.out to test the changes produces the following deltas: Before this change: Before after deltas for meminfo: MemFree: -32260 MemAvailable: -21496 KReclaimable: 21528 Slab: 22440 SReclaimable: 21528 SUnreclaim: 912 VmallocUsed: 16 Before after deltas for slabinfo: <slab>: <objects> [ * <size> = <total> ] tracefs_inode_cache: 14472 [* 1184 = 17134848] buffer_head: 24 [* 168 = 4032] hmem_inode_cache: 28 [* 1480 = 41440] dentry: 14450 [* 312 = 4508400] lsm_inode_cache: 14453 [* 32 = 462496] vma_lock: 11 [* 152 = 1672] vm_area_struct: 2 [* 184 = 368] trace_event_file: 1748 [* 88 = 153824] kmalloc-256: 1072 [* 256 = 274432] kmalloc-64: 2842 [* 64 = 181888] Total slab additions in size: 22,763,400 bytes With this change: Before after deltas for meminfo: MemFree: -12600 MemAvailable: -12580 Cached: 24 Active: 12 Inactive: 68 Inactive(anon): 48 Active(file): 12 Inactive(file): 20 Dirty: -4 AnonPages: 68 KReclaimable: 12 Slab: 1856 SReclaimable: 12 SUnreclaim: 1844 KernelStack: 16 PageTables: 36 VmallocUsed: 16 Before after deltas for slabinfo: <slab>: <objects> [ * <size> = <total> ] tracefs_inode_cache: 108 [* 1184 = 127872] buffer_head: 24 [* 168 = 4032] hmem_inode_cache: 18 [* 1480 = 26640] dentry: 127 [* 312 = 39624] lsm_inode_cache: 152 [* 32 = 4864] vma_lock: 67 [* 152 = 10184] vm_area_struct: -12 
[* 184 = -2208] trace_event_file: 1764 [* 96 = 169344] kmalloc-96: 14322 [* 96 = 1374912] kmalloc-64: 2814 [* 64 = 180096] kmalloc-32: 1103 [* 32 = 35296] kmalloc-16: 2308 [* 16 = 36928] kmalloc-8: 12800 [* 8 = 102400] Total slab additions in size: 2,109,984 bytes Which is a savings of 20,653,416 bytes (20 MB) per tracing instance. Link: https://lkml.kernel.org/r/1690568452-46553-10-git-send-email-akaher@vmware.com Signed-off-by: Ajay Kaher Co-developed-by: Steven Rostedt (VMware) Signed-off-by: Steven Rostedt (VMware) Tested-by: Ching-lin Yu Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/inode.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs/tracefs/inode.c') diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index d9273066f25f..bb6de89eb446 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -374,6 +374,23 @@ static const struct super_operations tracefs_super_operations = { .show_options = tracefs_show_options, }; +static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode) +{ + struct tracefs_inode *ti; + + if (!dentry || !inode) + return; + + ti = get_tracefs(inode); + if (ti && ti->flags & TRACEFS_EVENT_INODE) + eventfs_set_ef_status_free(dentry); + iput(inode); +} + +static const struct dentry_operations tracefs_dentry_operations = { + .d_iput = tracefs_dentry_iput, +}; + static int trace_fill_super(struct super_block *sb, void *data, int silent) { static const struct tree_descr trace_files[] = {{""}}; @@ -396,6 +413,7 @@ static int trace_fill_super(struct super_block *sb, void *data, int silent) goto fail; sb->s_op = &tracefs_super_operations; + sb->s_d_op = &tracefs_dentry_operations; tracefs_apply_options(sb, false); -- cgit From 086629773ec96216d06c72c801602cc56ebece27 Mon Sep 17 00:00:00 2001 From: Sishuai Gong Date: Thu, 17 Aug 2023 20:00:31 -0400 Subject: tracefs: Avoid changing i_mode to a temp value Right now inode->i_mode is updated twice to reach the desired value in 
tracefs_apply_options(). Because there is no lock protecting the two writes, other threads might read the intermediate value of inode->i_mode. Thread-1 Thread-2 // tracefs_apply_options() //e.g., acl_permission_check inode->i_mode &= ~S_IALLUGO; unsigned int mode = inode->i_mode; inode->i_mode |= opts->mode; I think there is no need to introduce a lock but it is better to only update inode->i_mode ONCE, so the readers will either see the old or latest value, rather than an intermediate/temporary value. Note, the race is not a security concern as the intermediate value is more locked down than either the start or end version. This is more just to do the conversion cleanly. Link: https://lore.kernel.org/linux-trace-kernel/AB5B0A1C-75D9-4E82-A7F0-CF7D0715587B@gmail.com Signed-off-by: Sishuai Gong Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/tracefs/inode.c') diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index bb6de89eb446..c7a10f965602 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -310,6 +310,7 @@ static int tracefs_apply_options(struct super_block *sb, bool remount) struct tracefs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); struct tracefs_mount_opts *opts = &fsi->mount_opts; + umode_t tmp_mode; /* * On remount, only reset mode/uid/gid if they were provided as mount @@ -317,8 +318,9 @@ static int tracefs_apply_options(struct super_block *sb, bool remount) */ if (!remount || opts->opts & BIT(Opt_mode)) { - inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= opts->mode; + tmp_mode = READ_ONCE(inode->i_mode) & ~S_IALLUGO; + tmp_mode |= opts->mode; + WRITE_ONCE(inode->i_mode, tmp_mode); } if (!remount || opts->opts & BIT(Opt_uid)) -- cgit