From 1404ff3cc3a14cb1fe8535e30b87d20da9513767 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 7 Oct 2016 16:56:49 -0700
Subject: fsnotify: drop notification_mutex before destroying event

fsnotify_flush_notify() and fanotify_release() destroy notification
event while holding notification_mutex.

The destruction of fanotify event includes a path_put() call which may
end up calling into a filesystem to delete an inode if we happen to be
the last holders of dentry reference which happens to be the last holder
of inode reference.

That in turn may violate lock ordering for some filesystems since
notification_mutex is also acquired e. g. during write when generating
fanotify event.

Also this is the only thing that forces notification_mutex to be a
sleeping lock.  So drop notification_mutex before destroying a
notification event.

Link: http://lkml.kernel.org/r/1473797711-14111-4-git-send-email-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/fanotify/fanotify_user.c | 6 ++++--
 fs/notify/notification.c           | 2 ++
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index a64313868d3a..46d135c4988f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -390,9 +390,11 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 	mutex_lock(&group->notification_mutex);
 	while (!fsnotify_notify_queue_is_empty(group)) {
 		fsn_event = fsnotify_remove_first_event(group);
-		if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS))
+		if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) {
+			mutex_unlock(&group->notification_mutex);
 			fsnotify_destroy_event(group, fsn_event);
-		else
+			mutex_lock(&group->notification_mutex);
+		} else
 			FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
 	}
 	mutex_unlock(&group->notification_mutex);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index e455e83ceeeb..7d563dea52a4 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -178,7 +178,9 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
 	mutex_lock(&group->notification_mutex);
 	while (!fsnotify_notify_queue_is_empty(group)) {
 		event = fsnotify_remove_first_event(group);
+		mutex_unlock(&group->notification_mutex);
 		fsnotify_destroy_event(group, event);
+		mutex_lock(&group->notification_mutex);
 	}
 	mutex_unlock(&group->notification_mutex);
 }
-- 
cgit 


From c21dbe20f606219fe54faf555b7bc5565487c58f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 7 Oct 2016 16:56:52 -0700
Subject: fsnotify: convert notification_mutex to a spinlock

notification_mutex is used to protect the list of pending events.  As such
there's no reason to use a sleeping lock for it.  Convert it to a
spinlock.

[jack@suse.cz: fixed version]
  Link: http://lkml.kernel.org/r/1474031567-1831-1-git-send-email-jack@suse.cz
Link: http://lkml.kernel.org/r/1473797711-14111-5-git-send-email-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/fanotify/fanotify_user.c | 27 ++++++++++++++-------------
 fs/notify/group.c                  |  6 +++---
 fs/notify/inotify/inotify_user.c   | 16 ++++++++--------
 fs/notify/notification.c           | 27 +++++++++++++++------------
 4 files changed, 40 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 46d135c4988f..80091a5dc8c0 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -49,12 +49,13 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
  * enough to fit in "count". Return an error pointer if the count
  * is not large enough.
  *
- * Called with the group->notification_mutex held.
+ * Called with the group->notification_lock held.
  */
 static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 					    size_t count)
 {
-	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+	BUG_ON(IS_ENABLED(CONFIG_SMP) &&
+	       !spin_is_locked(&group->notification_lock));
 
 	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
 
@@ -64,7 +65,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 	if (FAN_EVENT_METADATA_LEN > count)
 		return ERR_PTR(-EINVAL);
 
-	/* held the notification_mutex the whole time, so this is the
+	/* held the notification_lock the whole time, so this is the
 	 * same event we peeked above */
 	return fsnotify_remove_first_event(group);
 }
@@ -244,10 +245,10 @@ static unsigned int fanotify_poll(struct file *file, poll_table *wait)
 	int ret = 0;
 
 	poll_wait(file, &group->notification_waitq, wait);
-	mutex_lock(&group->notification_mutex);
+	spin_lock(&group->notification_lock);
 	if (!fsnotify_notify_queue_is_empty(group))
 		ret = POLLIN | POLLRDNORM;
-	mutex_unlock(&group->notification_mutex);
+	spin_unlock(&group->notification_lock);
 
 	return ret;
 }
@@ -268,9 +269,9 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
 
 	add_wait_queue(&group->notification_waitq, &wait);
 	while (1) {
-		mutex_lock(&group->notification_mutex);
+		spin_lock(&group->notification_lock);
 		kevent = get_one_event(group, count);
-		mutex_unlock(&group->notification_mutex);
+		spin_unlock(&group->notification_lock);
 
 		if (IS_ERR(kevent)) {
 			ret = PTR_ERR(kevent);
@@ -387,17 +388,17 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 	 * dequeue them and set the response. They will be freed once the
 	 * response is consumed and fanotify_get_response() returns.
 	 */
-	mutex_lock(&group->notification_mutex);
+	spin_lock(&group->notification_lock);
 	while (!fsnotify_notify_queue_is_empty(group)) {
 		fsn_event = fsnotify_remove_first_event(group);
 		if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) {
-			mutex_unlock(&group->notification_mutex);
+			spin_unlock(&group->notification_lock);
 			fsnotify_destroy_event(group, fsn_event);
-			mutex_lock(&group->notification_mutex);
+			spin_lock(&group->notification_lock);
 		} else
 			FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
 	}
-	mutex_unlock(&group->notification_mutex);
+	spin_unlock(&group->notification_lock);
 
 	/* Response for all permission events it set, wakeup waiters */
 	wake_up(&group->fanotify_data.access_waitq);
@@ -423,10 +424,10 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
 
 	switch (cmd) {
 	case FIONREAD:
-		mutex_lock(&group->notification_mutex);
+		spin_lock(&group->notification_lock);
 		list_for_each_entry(fsn_event, &group->notification_list, list)
 			send_len += FAN_EVENT_METADATA_LEN;
-		mutex_unlock(&group->notification_mutex);
+		spin_unlock(&group->notification_lock);
 		ret = put_user(send_len, (int __user *) p);
 		break;
 	}
diff --git a/fs/notify/group.c b/fs/notify/group.c
index b47f7cfdcaa4..fbe3cbebec16 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -45,9 +45,9 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
  */
 void fsnotify_group_stop_queueing(struct fsnotify_group *group)
 {
-	mutex_lock(&group->notification_mutex);
+	spin_lock(&group->notification_lock);
 	group->shutdown = true;
-	mutex_unlock(&group->notification_mutex);
+	spin_unlock(&group->notification_lock);
 }
 
 /*
@@ -125,7 +125,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 	atomic_set(&group->refcnt, 1);
 	atomic_set(&group->num_marks, 0);
 
-	mutex_init(&group->notification_mutex);
+	spin_lock_init(&group->notification_lock);
 	INIT_LIST_HEAD(&group->notification_list);
 	init_waitqueue_head(&group->notification_waitq);
 	group->max_events = UINT_MAX;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index b8d08d0d0a4d..69d1ea3d292a 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -115,10 +115,10 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
 	int ret = 0;
 
 	poll_wait(file, &group->notification_waitq, wait);
-	mutex_lock(&group->notification_mutex);
+	spin_lock(&group->notification_lock);
 	if (!fsnotify_notify_queue_is_empty(group))
 		ret = POLLIN | POLLRDNORM;
-	mutex_unlock(&group->notification_mutex);
+	spin_unlock(&group->notification_lock);
 
 	return ret;
 }
@@ -138,7 +138,7 @@ static int round_event_name_len(struct fsnotify_event *fsn_event)
  * enough to fit in "count". Return an error pointer if
  * not large enough.
  *
- * Called with the group->notification_mutex held.
+ * Called with the group->notification_lock held.
  */
 static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 					    size_t count)
@@ -157,7 +157,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 	if (event_size > count)
 		return ERR_PTR(-EINVAL);
 
-	/* held the notification_mutex the whole time, so this is the
+	/* held the notification_lock the whole time, so this is the
 	 * same event we peeked above */
 	fsnotify_remove_first_event(group);
 
@@ -234,9 +234,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 
 	add_wait_queue(&group->notification_waitq, &wait);
 	while (1) {
-		mutex_lock(&group->notification_mutex);
+		spin_lock(&group->notification_lock);
 		kevent = get_one_event(group, count);
-		mutex_unlock(&group->notification_mutex);
+		spin_unlock(&group->notification_lock);
 
 		pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent);
 
@@ -300,13 +300,13 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 
 	switch (cmd) {
 	case FIONREAD:
-		mutex_lock(&group->notification_mutex);
+		spin_lock(&group->notification_lock);
 		list_for_each_entry(fsn_event, &group->notification_list,
 				    list) {
 			send_len += sizeof(struct inotify_event);
 			send_len += round_event_name_len(fsn_event);
 		}
-		mutex_unlock(&group->notification_mutex);
+		spin_unlock(&group->notification_lock);
 		ret = put_user(send_len, (int __user *) p);
 		break;
 	}
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 7d563dea52a4..8a7a8cd041e8 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -63,7 +63,8 @@ EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
 /* return true if the notify queue is empty, false otherwise */
 bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
 {
-	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+	BUG_ON(IS_ENABLED(CONFIG_SMP) &&
+	       !spin_is_locked(&group->notification_lock));
 	return list_empty(&group->notification_list) ? true : false;
 }
 
@@ -95,10 +96,10 @@ int fsnotify_add_event(struct fsnotify_group *group,
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	mutex_lock(&group->notification_mutex);
+	spin_lock(&group->notification_lock);
 
 	if (group->shutdown) {
-		mutex_unlock(&group->notification_mutex);
+		spin_unlock(&group->notification_lock);
 		return 2;
 	}
 
@@ -106,7 +107,7 @@ int fsnotify_add_event(struct fsnotify_group *group,
 		ret = 2;
 		/* Queue overflow event only if it isn't already queued */
 		if (!list_empty(&group->overflow_event->list)) {
-			mutex_unlock(&group->notification_mutex);
+			spin_unlock(&group->notification_lock);
 			return ret;
 		}
 		event = group->overflow_event;
@@ -116,7 +117,7 @@ int fsnotify_add_event(struct fsnotify_group *group,
 	if (!list_empty(list) && merge) {
 		ret = merge(list, event);
 		if (ret) {
-			mutex_unlock(&group->notification_mutex);
+			spin_unlock(&group->notification_lock);
 			return ret;
 		}
 	}
@@ -124,7 +125,7 @@ int fsnotify_add_event(struct fsnotify_group *group,
 queue:
 	group->q_len++;
 	list_add_tail(&event->list, list);
-	mutex_unlock(&group->notification_mutex);
+	spin_unlock(&group->notification_lock);
 
 	wake_up(&group->notification_waitq);
 	kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);
@@ -139,7 +140,8 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
 {
 	struct fsnotify_event *event;
 
-	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+	BUG_ON(IS_ENABLED(CONFIG_SMP) &&
+	       !spin_is_locked(&group->notification_lock));
 
 	pr_debug("%s: group=%p\n", __func__, group);
 
@@ -161,7 +163,8 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
  */
 struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
 {
-	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+	BUG_ON(IS_ENABLED(CONFIG_SMP) &&
+	       !spin_is_locked(&group->notification_lock));
 
 	return list_first_entry(&group->notification_list,
 				struct fsnotify_event, list);
@@ -175,14 +178,14 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
 {
 	struct fsnotify_event *event;
 
-	mutex_lock(&group->notification_mutex);
+	spin_lock(&group->notification_lock);
 	while (!fsnotify_notify_queue_is_empty(group)) {
 		event = fsnotify_remove_first_event(group);
-		mutex_unlock(&group->notification_mutex);
+		spin_unlock(&group->notification_lock);
 		fsnotify_destroy_event(group, event);
-		mutex_lock(&group->notification_mutex);
+		spin_lock(&group->notification_lock);
 	}
-	mutex_unlock(&group->notification_mutex);
+	spin_unlock(&group->notification_lock);
 }
 
 /*
-- 
cgit 


From 073f65522aeb23e46fc8a809d69513132d3acc81 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 7 Oct 2016 16:56:55 -0700
Subject: fanotify: use notification_lock instead of access_lock

Fanotify code has its own lock (access_lock) to protect a list of events
waiting for a response from userspace.

However this is somewhat awkward as the same list_head in the event is
protected by notification_lock if it is part of the notification queue
and by access_lock if it is part of the fanotify private queue which
makes it difficult for any reliable checks in the generic code.  So make
fanotify use the same lock - notification_lock - for protecting its
private event list.

Link: http://lkml.kernel.org/r/1473797711-14111-6-git-send-email-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/fanotify/fanotify_user.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 80091a5dc8c0..189fab3ac4e6 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -148,7 +148,7 @@ static struct fanotify_perm_event_info *dequeue_event(
 {
 	struct fanotify_perm_event_info *event, *return_e = NULL;
 
-	spin_lock(&group->fanotify_data.access_lock);
+	spin_lock(&group->notification_lock);
 	list_for_each_entry(event, &group->fanotify_data.access_list,
 			    fae.fse.list) {
 		if (event->fd != fd)
@@ -158,7 +158,7 @@ static struct fanotify_perm_event_info *dequeue_event(
 		return_e = event;
 		break;
 	}
-	spin_unlock(&group->fanotify_data.access_lock);
+	spin_unlock(&group->notification_lock);
 
 	pr_debug("%s: found return_re=%p\n", __func__, return_e);
 
@@ -310,10 +310,10 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
 				wake_up(&group->fanotify_data.access_waitq);
 				break;
 			}
-			spin_lock(&group->fanotify_data.access_lock);
+			spin_lock(&group->notification_lock);
 			list_add_tail(&kevent->list,
 				      &group->fanotify_data.access_list);
-			spin_unlock(&group->fanotify_data.access_lock);
+			spin_unlock(&group->notification_lock);
 #endif
 		}
 		buf += ret;
@@ -372,7 +372,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 	 * Process all permission events on access_list and notification queue
 	 * and simulate reply from userspace.
 	 */
-	spin_lock(&group->fanotify_data.access_lock);
+	spin_lock(&group->notification_lock);
 	list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
 				 fae.fse.list) {
 		pr_debug("%s: found group=%p event=%p\n", __func__, group,
@@ -381,14 +381,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 		list_del_init(&event->fae.fse.list);
 		event->response = FAN_ALLOW;
 	}
-	spin_unlock(&group->fanotify_data.access_lock);
 
 	/*
 	 * Destroy all non-permission events. For permission events just
 	 * dequeue them and set the response. They will be freed once the
 	 * response is consumed and fanotify_get_response() returns.
 	 */
-	spin_lock(&group->notification_lock);
 	while (!fsnotify_notify_queue_is_empty(group)) {
 		fsn_event = fsnotify_remove_first_event(group);
 		if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) {
@@ -768,7 +766,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 		event_f_flags |= O_LARGEFILE;
 	group->fanotify_data.f_flags = event_f_flags;
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
-	spin_lock_init(&group->fanotify_data.access_lock);
 	init_waitqueue_head(&group->fanotify_data.access_waitq);
 	INIT_LIST_HEAD(&group->fanotify_data.access_list);
 #endif
-- 
cgit 


From 0b1b86527df4b1f398266c23e926dd788925bb69 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 7 Oct 2016 16:56:58 -0700
Subject: fanotify: fix possible false warning when freeing events

When freeing permission events by fsnotify_destroy_event(), the warning
WARN_ON(!list_empty(&event->list)); may falsely hit.

This is because although fanotify_get_response() saw event->response
set, there is nothing to make sure the current CPU also sees the removal
of the event from the list.  Add proper locking around the WARN_ON() to
avoid the false warning.

Link: http://lkml.kernel.org/r/1473797711-14111-7-git-send-email-jack@suse.cz
Reported-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/notification.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 8a7a8cd041e8..1a8010e7a2a0 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -74,8 +74,17 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
 	/* Overflow events are per-group and we don't want to free them */
 	if (!event || event->mask == FS_Q_OVERFLOW)
 		return;
-	/* If the event is still queued, we have a problem... */
-	WARN_ON(!list_empty(&event->list));
+	/*
+	 * If the event is still queued, we have a problem... Do an unreliable
+	 * lockless check first to avoid locking in the common case. The
+	 * locking may be necessary for permission events which got removed
+	 * from the list by a different CPU than the one freeing the event.
+	 */
+	if (!list_empty(&event->list)) {
+		spin_lock(&group->notification_lock);
+		WARN_ON(!list_empty(&event->list));
+		spin_unlock(&group->notification_lock);
+	}
 	group->ops->free_event(event);
 }
 
-- 
cgit 


From ed2726406c6a71f5da63719c0ba7d9e21dd9581c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 7 Oct 2016 16:57:01 -0700
Subject: fsnotify: clean up spinlock assertions

Use assert_spin_locked() macro instead of hand-made BUG_ON statements.

Link: http://lkml.kernel.org/r/1474537439-18919-1-git-send-email-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Suggested-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/fanotify/fanotify_user.c | 3 +--
 fs/notify/notification.c           | 9 +++------
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 189fab3ac4e6..7ebfca6a1427 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -54,8 +54,7 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
 static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 					    size_t count)
 {
-	BUG_ON(IS_ENABLED(CONFIG_SMP) &&
-	       !spin_is_locked(&group->notification_lock));
+	assert_spin_locked(&group->notification_lock);
 
 	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
 
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 1a8010e7a2a0..66f85c651c52 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -63,8 +63,7 @@ EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
 /* return true if the notify queue is empty, false otherwise */
 bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
 {
-	BUG_ON(IS_ENABLED(CONFIG_SMP) &&
-	       !spin_is_locked(&group->notification_lock));
+	assert_spin_locked(&group->notification_lock);
 	return list_empty(&group->notification_list) ? true : false;
 }
 
@@ -149,8 +148,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
 {
 	struct fsnotify_event *event;
 
-	BUG_ON(IS_ENABLED(CONFIG_SMP) &&
-	       !spin_is_locked(&group->notification_lock));
+	assert_spin_locked(&group->notification_lock);
 
 	pr_debug("%s: group=%p\n", __func__, group);
 
@@ -172,8 +170,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
  */
 struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
 {
-	BUG_ON(IS_ENABLED(CONFIG_SMP) &&
-	       !spin_is_locked(&group->notification_lock));
+	assert_spin_locked(&group->notification_lock);
 
 	return list_first_entry(&group->notification_list,
 				struct fsnotify_event, list);
-- 
cgit 


From 0b41be07630ef4ef45e082fed530c633e6cdce88 Mon Sep 17 00:00:00 2001
From: Bhaktipriya Shridhar <bhaktipriya96@gmail.com>
Date: Fri, 7 Oct 2016 16:57:07 -0700
Subject: fs/ocfs2/dlmfs: remove deprecated create_singlethread_workqueue()

The workqueue "user_dlm_worker" queues a single work item
&lockres->l_work per user_lock_res instance and so it doesn't require
execution ordering.  Hence, alloc_workqueue has been used to replace the
deprecated create_singlethread_workqueue instance.

The WQ_MEM_RECLAIM flag has been set to ensure forward progress under
memory pressure.

Since there are fixed number of work items, explicit concurrency
limit is unnecessary here.

Link: http://lkml.kernel.org/r/9748136d3a3b18138ad1d6ba708367aa1fe9f98c.1472590094.git.bhaktipriya96@gmail.com
Signed-off-by: Bhaktipriya Shridhar <bhaktipriya96@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlmfs/dlmfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index ef474cdd6404..354cdf9714aa 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -646,7 +646,7 @@ static int __init init_dlmfs_fs(void)
 	}
 	cleanup_inode = 1;
 
-	user_dlm_worker = create_singlethread_workqueue("user_dlm");
+	user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0);
 	if (!user_dlm_worker) {
 		status = -ENOMEM;
 		goto bail;
-- 
cgit 


From bf940776c051ad6226567d117f8102f1c09f0431 Mon Sep 17 00:00:00 2001
From: Bhaktipriya Shridhar <bhaktipriya96@gmail.com>
Date: Fri, 7 Oct 2016 16:57:10 -0700
Subject: fs/ocfs2/cluster: remove deprecated create_singlethread_workqueue()

The workqueue "o2net_wq" queues multiple work items viz
&old_sc->sc_shutdown_work, &sc->sc_rx_work, &sc->sc_connect_work which
require strict execution ordering.  Hence, an ordered dedicated
workqueue has been used.

WQ_MEM_RECLAIM has been set to ensure forward progress under memory
pressure.

Link: http://lkml.kernel.org/r/ddc12e5766c79ba26f8a00d98049107f8a1d4866.1472590094.git.bhaktipriya96@gmail.com
Signed-off-by: Bhaktipriya Shridhar <bhaktipriya96@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1d67fcbf7160..8abab16b4602 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -2104,7 +2104,7 @@ int o2net_start_listening(struct o2nm_node *node)
 	BUG_ON(o2net_listen_sock != NULL);
 
 	mlog(ML_KTHREAD, "starting o2net thread...\n");
-	o2net_wq = create_singlethread_workqueue("o2net");
+	o2net_wq = alloc_ordered_workqueue("o2net", WQ_MEM_RECLAIM);
 	if (o2net_wq == NULL) {
 		mlog(ML_ERROR, "unable to launch o2net thread\n");
 		return -ENOMEM; /* ? */
-- 
cgit 


From 44be975691703cda8e23fe45296f78b287e40232 Mon Sep 17 00:00:00 2001
From: Bhaktipriya Shridhar <bhaktipriya96@gmail.com>
Date: Fri, 7 Oct 2016 16:57:13 -0700
Subject: fs/ocfs2/super: remove deprecated create_singlethread_workqueue()

The workqueue "ocfs2_wq" queues multiple work items viz
&osb->la_enable_wq, &journal->j_recovery_work, &os->os_orphan_scan_work,
&osb->osb_truncate_log_wq which require strict execution ordering.  Hence,
an ordered dedicated workqueue has been used.

WQ_MEM_RECLAIM has been set to ensure forward progress under memory
pressure because the workqueue is being used on a memory reclaim path.

Link: http://lkml.kernel.org/r/66279de510a7f4cfc6e386d99b7e04b3f65fb11b.1472590094.git.bhaktipriya96@gmail.com
Signed-off-by: Bhaktipriya Shridhar <bhaktipriya96@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 603b28d6f008..f56fe39fab04 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2329,7 +2329,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 	cleancache_init_shared_fs(sb);
 
-	osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+	osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM);
 	if (!osb->ocfs2_wq) {
 		status = -ENOMEM;
 		mlog_errno(status);
-- 
cgit 


From 055fdcff35e5e688f5002e7ab330469a58883ff7 Mon Sep 17 00:00:00 2001
From: Bhaktipriya Shridhar <bhaktipriya96@gmail.com>
Date: Fri, 7 Oct 2016 16:57:17 -0700
Subject: fs/ocfs2/dlm: remove deprecated create_singlethread_workqueue()

The workqueue "dlm_worker" queues a single work item &dlm->dispatched_work
and thus it doesn't require execution ordering.  Hence, alloc_workqueue
has been used to replace the deprecated create_singlethread_workqueue
instance.

The WQ_MEM_RECLAIM flag has been set to ensure forward progress under
memory pressure.

Since there are fixed number of work items, explicit concurrency
limit is unnecessary here.

Link: http://lkml.kernel.org/r/2b5ad8d6688effe1a9ddb2bc2082d26fbbe00302.1472590094.git.bhaktipriya96@gmail.com
Signed-off-by: Bhaktipriya Shridhar <bhaktipriya96@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmdomain.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 533bd524e41e..733e4e79c8e2 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1904,7 +1904,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
 	}
 
 	snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
-	dlm->dlm_worker = create_singlethread_workqueue(wq_name);
+	dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0);
 	if (!dlm->dlm_worker) {
 		status = -ENOMEM;
 		mlog_errno(status);
-- 
cgit 


From 48e509ece97e00b68e52d1d18e3e4b809c5b3991 Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Fri, 7 Oct 2016 16:57:20 -0700
Subject: ocfs2: fix undefined struct variable in inode.h

The extern struct variable ocfs2_inode_cache is not defined. It meant to
use ocfs2_inode_cachep defined in super.c, I think. Fortunately it is
not used anywhere now, so no impact actually. Clean it up to fix this
mistake.

Link: http://lkml.kernel.org/r/57E1E49D.8050503@huawei.com
Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Reviewed-by: Eric Ren <zren@suse.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/inode.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 50cc55047443..5af68fcdf9d3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -123,8 +123,6 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 #define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
 #define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
 
-extern struct kmem_cache *ocfs2_inode_cache;
-
 extern const struct address_space_operations ocfs2_aops;
 extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops;
 
-- 
cgit 


From dbe6ec815641aa22b50775aaeb47fa3a8d04ccf1 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Fri, 7 Oct 2016 16:59:59 -0700
Subject: ext2/4, xfs: call thp_get_unmapped_area() for pmd mappings

To support DAX pmd mappings with unmodified applications, filesystems
need to align an mmap address by the pmd size.

Call thp_get_unmapped_area() from f_op->get_unmapped_area.

Note, there is no change in behavior for a non-DAX file.

Link: http://lkml.kernel.org/r/1472497881-9323-3-git-send-email-toshi.kani@hpe.com
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/file.c    | 1 +
 fs/ext4/file.c    | 1 +
 fs/xfs/xfs_file.c | 1 +
 3 files changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 423cc01c9d41..0ca363d1341c 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -234,6 +234,7 @@ const struct file_operations ext2_file_operations = {
 	.open		= dquot_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_fsync,
+	.get_unmapped_area = thp_get_unmapped_area,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 };
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 261ac3734c58..28f542bb0bda 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -703,6 +703,7 @@ const struct file_operations ext4_file_operations = {
 	.open		= ext4_file_open,
 	.release	= ext4_release_file,
 	.fsync		= ext4_sync_file,
+	.get_unmapped_area = thp_get_unmapped_area,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= ext4_fallocate,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c68517b0f248..bac55c687085 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1618,6 +1618,7 @@ const struct file_operations xfs_file_operations = {
 	.open		= xfs_file_open,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
+	.get_unmapped_area = thp_get_unmapped_area,
 	.fallocate	= xfs_file_fallocate,
 };
 
-- 
cgit 


From 0f30206bf2a42e278c2cec32e4b722626458c75b Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 7 Oct 2016 17:00:06 -0700
Subject: fs/proc/task_mmu.c: make the task_mmu walk_page_range() limit in
 clear_refs_write() obvious

Trying to walk all of virtual memory requires architecture specific
knowledge.  On x86_64, addresses must be sign extended from bit 48,
whereas on arm64 the top VA_BITS of address space have their own set of
page tables.

clear_refs_write() calls walk_page_range() on the range 0 to ~0UL, it
provides a test_walk() callback that only expects to be walking over
VMAs.  Currently walk_pmd_range() will skip memory regions that don't
have a VMA, reporting them as a hole.

As this call only expects to walk user address space, make it walk 0 to
'highest_vm_end'.

Link: http://lkml.kernel.org/r/1472655792-22439-1-git-send-email-james.morse@arm.com
Signed-off-by: James Morse <james.morse@arm.com>
Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f6fa99eca515..d2a70cf2154e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1070,7 +1070,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 			}
 			mmu_notifier_invalidate_range_start(mm, 0, -1);
 		}
-		walk_page_range(0, ~0UL, &clear_refs_walk);
+		walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
 		if (type == CLEAR_REFS_SOFT_DIRTY)
 			mmu_notifier_invalidate_range_end(mm, 0, -1);
 		flush_tlb_mm(mm);
-- 
cgit 


From 6fcb52a56ff60d240f06296b12827e7f20d45f63 Mon Sep 17 00:00:00 2001
From: Aaron Lu <aaron.lu@intel.com>
Date: Fri, 7 Oct 2016 17:00:08 -0700
Subject: thp: reduce usage of huge zero page's atomic counter

The global zero page is used to satisfy an anonymous read fault.  If
THP(Transparent HugePage) is enabled then the global huge zero page is
used.  The global huge zero page uses an atomic counter for reference
counting and is allocated/freed dynamically according to its counter
value.

CPU time spent on that counter will greatly increase if there are a lot
of processes doing anonymous read faults.  This patch proposes a way to
reduce the access to the global counter so that the CPU load can be
reduced accordingly.

To do this, a new flag of the mm_struct is introduced:
MMF_USED_HUGE_ZERO_PAGE.  With this flag, the process only need to touch
the global counter in two cases:

 1 The first time it uses the global huge zero page;
 2 The time when mm_user of its mm_struct reaches zero.

Note that right now, the huge zero page is eligible to be freed as soon
as its last use goes away.  With this patch, the page will not be
eligible to be freed until the exit of the last process from which it
was ever used.

And with the use of mm_user, the kthread is not eligible to use huge
zero page either.  Since no kthread is using huge zero page today, there
is no difference after applying this patch.  But if that is not desired,
I can change it to when mm_count reaches zero.

Case used for test on Haswell EP:

  usemem -n 72 --readonly -j 0x200000 100G

Which spawns 72 processes and each will mmap 100G anonymous space and
then do read only access to that space sequentially with a step of 2MB.

  CPU cycles from perf report for base commit:
      54.03%  usemem   [kernel.kallsyms]   [k] get_huge_zero_page
  CPU cycles from perf report for this commit:
       0.11%  usemem   [kernel.kallsyms]   [k] mm_get_huge_zero_page

Performance(throughput) of the workload for base commit: 1784430792
Performance(throughput) of the workload for this commit: 4726928591
164% increase.

Runtime of the workload for base commit: 707592 us
Runtime of the workload for this commit: 303970 us
50% drop.

Link: http://lkml.kernel.org/r/fe51a88f-446a-4622-1363-ad1282d71385@intel.com
Signed-off-by: Aaron Lu <aaron.lu@intel.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dax.c b/fs/dax.c
index cc025f82ef07..014defd2e744 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1036,7 +1036,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!write && !buffer_mapped(&bh)) {
 		spinlock_t *ptl;
 		pmd_t entry;
-		struct page *zero_page = get_huge_zero_page();
+		struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
 
 		if (unlikely(!zero_page)) {
 			dax_pmd_dbg(&bh, address, "no zero page");
-- 
cgit 


From 8cd797887ae0a73313ba248e027e59c0a597d693 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 7 Oct 2016 17:00:24 -0700
Subject: mm: remove page_file_index

After using the offset of the swap entry as the key of the swap cache,
the page_index() becomes exactly same as page_file_index().  So the
page_file_index() is removed and the callers are changed to use
page_index() instead.

Link: http://lkml.kernel.org/r/1473270649-27229-2-git-send-email-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: Anna Schumaker <anna.schumaker@netapp.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/internal.h | 6 +++---
 fs/nfs/pagelist.c | 2 +-
 fs/nfs/read.c     | 2 +-
 fs/nfs/write.c    | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 74935a19e4bf..da9e5584bfdc 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -681,11 +681,11 @@ unsigned int nfs_page_length(struct page *page)
 	loff_t i_size = i_size_read(page_file_mapping(page)->host);
 
 	if (i_size > 0) {
-		pgoff_t page_index = page_file_index(page);
+		pgoff_t index = page_index(page);
 		pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT;
-		if (page_index < end_index)
+		if (index < end_index)
 			return PAGE_SIZE;
-		if (page_index == end_index)
+		if (index == end_index)
 			return ((i_size - 1) & ~PAGE_MASK) + 1;
 	}
 	return 0;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 174dd4cf5747..965db474f4b0 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -342,7 +342,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
 	 * update_nfs_request below if the region is not locked. */
 	req->wb_page    = page;
 	if (page) {
-		req->wb_index = page_file_index(page);
+		req->wb_index = page_index(page);
 		get_page(page);
 	}
 	req->wb_offset  = offset;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 572e5b3b06f1..defc9233e985 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -295,7 +295,7 @@ int nfs_readpage(struct file *file, struct page *page)
 	int		error;
 
 	dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
-		page, PAGE_SIZE, page_file_index(page));
+		page, PAGE_SIZE, page_index(page));
 	nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
 	nfs_add_stats(inode, NFSIOS_READPAGES, 1);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3a6724c6eb5f..53211838f72a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -151,7 +151,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
 	spin_lock(&inode->i_lock);
 	i_size = i_size_read(inode);
 	end_index = (i_size - 1) >> PAGE_SHIFT;
-	if (i_size > 0 && page_file_index(page) < end_index)
+	if (i_size > 0 && page_index(page) < end_index)
 		goto out;
 	end = page_file_offset(page) + ((loff_t)offset+count);
 	if (i_size >= end)
@@ -603,7 +603,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
 {
 	int ret;
 
-	nfs_pageio_cond_complete(pgio, page_file_index(page));
+	nfs_pageio_cond_complete(pgio, page_index(page));
 	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE,
 				   launder);
 	if (ret == -EAGAIN) {
-- 
cgit 


From 461a7184320a1b4d2c12ad538354062fef4ee0f1 Mon Sep 17 00:00:00 2001
From: Yisheng Xie <xieyisheng1@huawei.com>
Date: Fri, 7 Oct 2016 17:01:46 -0700
Subject: mm/hugetlb: introduce ARCH_HAS_GIGANTIC_PAGE

Avoid making ifdef get pretty unwieldy if many ARCHs support gigantic
page.  No functional change with this patch.

Link: http://lkml.kernel.org/r/1475227569-63446-2-git-send-email-xieyisheng1@huawei.com
Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 3ef62bad8f2b..4bd03a2b0518 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -200,6 +200,9 @@ config HUGETLBFS
 config HUGETLB_PAGE
 	def_bool HUGETLBFS
 
+config ARCH_HAS_GIGANTIC_PAGE
+	bool
+
 source "fs/configfs/Kconfig"
 source "fs/efivarfs/Kconfig"
 
-- 
cgit 


From 72e2936c04f7d2a4bf87d7f72d3bf11cf91ebb47 Mon Sep 17 00:00:00 2001
From: zhong jiang <zhongjiang@huawei.com>
Date: Fri, 7 Oct 2016 17:02:01 -0700
Subject: mm: remove unnecessary condition in remove_inode_hugepages

When the huge page is added to the page cahce (huge_add_to_page_cache),
the page private flag will be cleared.  since this code
(remove_inode_hugepages) will only be called for pages in the page
cahce, PagePrivate(page) will always be false.

The patch remove the code without any functional change.

Link: http://lkml.kernel.org/r/1475113323-29368-1-git-send-email-zhongjiang@huawei.com
Signed-off-by: zhong jiang <zhongjiang@huawei.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Tested-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4ea71eba40a5..7337cac29e9e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -416,7 +416,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 		for (i = 0; i < pagevec_count(&pvec); ++i) {
 			struct page *page = pvec.pages[i];
-			bool rsv_on_error;
 			u32 hash;
 
 			/*
@@ -458,18 +457,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 			 * cache (remove_huge_page) BEFORE removing the
 			 * region/reserve map (hugetlb_unreserve_pages).  In
 			 * rare out of memory conditions, removal of the
-			 * region/reserve map could fail.  Before free'ing
-			 * the page, note PagePrivate which is used in case
-			 * of error.
+			 * region/reserve map could fail. Correspondingly,
+			 * the subpool and global reserve usage count can need
+			 * to be adjusted.
 			 */
-			rsv_on_error = !PagePrivate(page);
+			VM_BUG_ON(PagePrivate(page));
 			remove_huge_page(page);
 			freed++;
 			if (!truncate_op) {
 				if (unlikely(hugetlb_unreserve_pages(inode,
 							next, next + 1, 1)))
-					hugetlb_fix_reserve_counts(inode,
-								rsv_on_error);
+					hugetlb_fix_reserve_counts(inode);
 			}
 
 			unlock_page(page);
-- 
cgit 


From f7a5f132b447cb6301ab3f0b0468a63db29e41f5 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 7 Oct 2016 17:02:17 -0700
Subject: proc: faster /proc/*/status

top(1) opens the following files for every PID:

	/proc/*/stat
	/proc/*/statm
	/proc/*/status

This patch switches /proc/*/status away from seq_printf().
The result is 13.5% speedup.

Benchmark is open("/proc/self/status")+read+close 1.000.000 million times.

				BEFORE
$ perf stat -r 10 taskset -c 3 ./proc-self-status

 Performance counter stats for 'taskset -c 3 ./proc-self-status' (10 runs):

      10748.474301      task-clock (msec)         #    0.954 CPUs utilized            ( +-  0.91% )
                12      context-switches          #    0.001 K/sec                    ( +-  1.09% )
                 1      cpu-migrations            #    0.000 K/sec
               104      page-faults               #    0.010 K/sec                    ( +-  0.45% )
    37,424,127,876      cycles                    #    3.482 GHz                      ( +-  0.04% )
     8,453,010,029      stalled-cycles-frontend   #   22.59% frontend cycles idle     ( +-  0.12% )
     3,747,609,427      stalled-cycles-backend    #  10.01% backend cycles idle       ( +-  0.68% )
    65,632,764,147      instructions              #    1.75  insn per cycle
                                                  #    0.13  stalled cycles per insn  ( +-  0.00% )
    13,981,324,775      branches                  # 1300.773 M/sec                    ( +-  0.00% )
       138,967,110      branch-misses             #    0.99% of all branches          ( +-  0.18% )

      11.263885428 seconds time elapsed                                          ( +-  0.04% )
      ^^^^^^^^^^^^

				AFTER
$ perf stat -r 10 taskset -c 3 ./proc-self-status

 Performance counter stats for 'taskset -c 3 ./proc-self-status' (10 runs):

       9010.521776      task-clock (msec)         #    0.925 CPUs utilized            ( +-  1.54% )
                11      context-switches          #    0.001 K/sec                    ( +-  1.54% )
                 1      cpu-migrations            #    0.000 K/sec                    ( +- 11.11% )
               103      page-faults               #    0.011 K/sec                    ( +-  0.60% )
    32,352,310,603      cycles                    #    3.591 GHz                      ( +-  0.07% )
     7,849,199,578      stalled-cycles-frontend   #   24.26% frontend cycles idle     ( +-  0.27% )
     3,269,738,842      stalled-cycles-backend    #  10.11% backend cycles idle       ( +-  0.73% )
    56,012,163,567      instructions              #    1.73  insn per cycle
                                                  #    0.14  stalled cycles per insn  ( +-  0.00% )
    11,735,778,795      branches                  # 1302.453 M/sec                    ( +-  0.00% )
        98,084,459      branch-misses             #    0.84% of all branches          ( +-  0.28% )

       9.741247736 seconds time elapsed                                          ( +-  0.07% )
       ^^^^^^^^^^^

Link: http://lkml.kernel.org/r/20160806125608.GB1187@p183.telecom.by
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Joe Perches <joe@perches.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c | 87 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 47 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 88c7de12197b..5e7d2521d496 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -186,51 +186,52 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	task_unlock(p);
 	rcu_read_unlock();
 
-	seq_printf(m,
-		"State:\t%s\n"
-		"Tgid:\t%d\n"
-		"Ngid:\t%d\n"
-		"Pid:\t%d\n"
-		"PPid:\t%d\n"
-		"TracerPid:\t%d\n"
-		"Uid:\t%d\t%d\t%d\t%d\n"
-		"Gid:\t%d\t%d\t%d\t%d\n"
-		"FDSize:\t%d\nGroups:\t",
-		get_task_state(p),
-		tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid,
-		from_kuid_munged(user_ns, cred->uid),
-		from_kuid_munged(user_ns, cred->euid),
-		from_kuid_munged(user_ns, cred->suid),
-		from_kuid_munged(user_ns, cred->fsuid),
-		from_kgid_munged(user_ns, cred->gid),
-		from_kgid_munged(user_ns, cred->egid),
-		from_kgid_munged(user_ns, cred->sgid),
-		from_kgid_munged(user_ns, cred->fsgid),
-		max_fds);
-
+	seq_printf(m, "State:\t%s", get_task_state(p));
+
+	seq_puts(m, "\nTgid:\t");
+	seq_put_decimal_ull(m, 0, tgid);
+	seq_puts(m, "\nNgid:\t");
+	seq_put_decimal_ull(m, 0, ngid);
+	seq_puts(m, "\nPid:\t");
+	seq_put_decimal_ull(m, 0, pid_nr_ns(pid, ns));
+	seq_puts(m, "\nPPid:\t");
+	seq_put_decimal_ull(m, 0, ppid);
+	seq_puts(m, "\nTracerPid:\t");
+	seq_put_decimal_ull(m, 0, tpid);
+	seq_puts(m, "\nUid:");
+	seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->uid));
+	seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->euid));
+	seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->suid));
+	seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->fsuid));
+	seq_puts(m, "\nGid:");
+	seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->gid));
+	seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->egid));
+	seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->sgid));
+	seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->fsgid));
+	seq_puts(m, "\nFDSize:\t");
+	seq_put_decimal_ull(m, 0, max_fds);
+
+	seq_puts(m, "\nGroups:\t");
 	group_info = cred->group_info;
 	for (g = 0; g < group_info->ngroups; g++)
-		seq_printf(m, "%d ",
-			   from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
+		seq_put_decimal_ull(m, g ? ' ' : 0, from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
 	put_cred(cred);
+	/* Trailing space shouldn't have been added in the first place. */
+	seq_putc(m, ' ');
 
 #ifdef CONFIG_PID_NS
 	seq_puts(m, "\nNStgid:");
 	for (g = ns->level; g <= pid->level; g++)
-		seq_printf(m, "\t%d",
-			task_tgid_nr_ns(p, pid->numbers[g].ns));
+		seq_put_decimal_ull(m, '\t', task_tgid_nr_ns(p, pid->numbers[g].ns));
 	seq_puts(m, "\nNSpid:");
 	for (g = ns->level; g <= pid->level; g++)
-		seq_printf(m, "\t%d",
-			task_pid_nr_ns(p, pid->numbers[g].ns));
+		seq_put_decimal_ull(m, '\t', task_pid_nr_ns(p, pid->numbers[g].ns));
 	seq_puts(m, "\nNSpgid:");
 	for (g = ns->level; g <= pid->level; g++)
-		seq_printf(m, "\t%d",
-			task_pgrp_nr_ns(p, pid->numbers[g].ns));
+		seq_put_decimal_ull(m, '\t', task_pgrp_nr_ns(p, pid->numbers[g].ns));
 	seq_puts(m, "\nNSsid:");
 	for (g = ns->level; g <= pid->level; g++)
-		seq_printf(m, "\t%d",
-			task_session_nr_ns(p, pid->numbers[g].ns));
+		seq_put_decimal_ull(m, '\t', task_session_nr_ns(p, pid->numbers[g].ns));
 #endif
 	seq_putc(m, '\n');
 }
@@ -299,11 +300,14 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 		unlock_task_sighand(p, &flags);
 	}
 
-	seq_printf(m, "Threads:\t%d\n", num_threads);
-	seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim);
+	seq_puts(m, "Threads:\t");
+	seq_put_decimal_ull(m, 0, num_threads);
+	seq_puts(m, "\nSigQ:\t");
+	seq_put_decimal_ull(m, 0, qsize);
+	seq_put_decimal_ull(m, '/', qlim);
 
 	/* render them all */
-	render_sigset_t(m, "SigPnd:\t", &pending);
+	render_sigset_t(m, "\nSigPnd:\t", &pending);
 	render_sigset_t(m, "ShdPnd:\t", &shpending);
 	render_sigset_t(m, "SigBlk:\t", &blocked);
 	render_sigset_t(m, "SigIgn:\t", &ignored);
@@ -348,17 +352,20 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
 static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 {
 #ifdef CONFIG_SECCOMP
-	seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode);
+	seq_puts(m, "Seccomp:\t");
+	seq_put_decimal_ull(m, 0, p->seccomp.mode);
+	seq_putc(m, '\n');
 #endif
 }
 
 static inline void task_context_switch_counts(struct seq_file *m,
 						struct task_struct *p)
 {
-	seq_printf(m,	"voluntary_ctxt_switches:\t%lu\n"
-			"nonvoluntary_ctxt_switches:\t%lu\n",
-			p->nvcsw,
-			p->nivcsw);
+	seq_puts(m, "voluntary_ctxt_switches:\t");
+	seq_put_decimal_ull(m, 0, p->nvcsw);
+	seq_puts(m, "\nnonvoluntary_ctxt_switches:\t");
+	seq_put_decimal_ull(m, 0, p->nivcsw);
+	seq_putc(m, '\n');
 }
 
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
-- 
cgit 


From 75ba1d07fd6a494851db5132612944a9d4773f9c Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Fri, 7 Oct 2016 17:02:20 -0700
Subject: seq/proc: modify seq_put_decimal_[u]ll to take a const char *, not
 char

Allow some seq_puts removals by taking a string instead of a single
char.

[akpm@linux-foundation.org: update vmstat_show(), per Joe]
Link: http://lkml.kernel.org/r/667e1cf3d436de91a5698170a1e98d882905e956.1470704995.git.joe@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Cc: Joe Perches <joe@perches.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c | 178 ++++++++++++++++++++++++++------------------------------
 fs/proc/stat.c  |  49 ++++++++--------
 fs/seq_file.c   |  57 +++++++++++++-----
 3 files changed, 151 insertions(+), 133 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 5e7d2521d496..d25b44601b30 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -188,33 +188,26 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 
 	seq_printf(m, "State:\t%s", get_task_state(p));
 
-	seq_puts(m, "\nTgid:\t");
-	seq_put_decimal_ull(m, 0, tgid);
-	seq_puts(m, "\nNgid:\t");
-	seq_put_decimal_ull(m, 0, ngid);
-	seq_puts(m, "\nPid:\t");
-	seq_put_decimal_ull(m, 0, pid_nr_ns(pid, ns));
-	seq_puts(m, "\nPPid:\t");
-	seq_put_decimal_ull(m, 0, ppid);
-	seq_puts(m, "\nTracerPid:\t");
-	seq_put_decimal_ull(m, 0, tpid);
-	seq_puts(m, "\nUid:");
-	seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->uid));
-	seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->euid));
-	seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->suid));
-	seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->fsuid));
-	seq_puts(m, "\nGid:");
-	seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->gid));
-	seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->egid));
-	seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->sgid));
-	seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->fsgid));
-	seq_puts(m, "\nFDSize:\t");
-	seq_put_decimal_ull(m, 0, max_fds);
+	seq_put_decimal_ull(m, "\nTgid:\t", tgid);
+	seq_put_decimal_ull(m, "\nNgid:\t", ngid);
+	seq_put_decimal_ull(m, "\nPid:\t", pid_nr_ns(pid, ns));
+	seq_put_decimal_ull(m, "\nPPid:\t", ppid);
+	seq_put_decimal_ull(m, "\nTracerPid:\t", tpid);
+	seq_put_decimal_ull(m, "\nUid:\t", from_kuid_munged(user_ns, cred->uid));
+	seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->euid));
+	seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->suid));
+	seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->fsuid));
+	seq_put_decimal_ull(m, "\nGid:\t", from_kgid_munged(user_ns, cred->gid));
+	seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->egid));
+	seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid));
+	seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid));
+	seq_put_decimal_ull(m, "\nFDSize:\t", max_fds);
 
 	seq_puts(m, "\nGroups:\t");
 	group_info = cred->group_info;
 	for (g = 0; g < group_info->ngroups; g++)
-		seq_put_decimal_ull(m, g ? ' ' : 0, from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
+		seq_put_decimal_ull(m, g ? " " : "",
+				    from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
 	put_cred(cred);
 	/* Trailing space shouldn't have been added in the first place. */
 	seq_putc(m, ' ');
@@ -222,16 +215,16 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 #ifdef CONFIG_PID_NS
 	seq_puts(m, "\nNStgid:");
 	for (g = ns->level; g <= pid->level; g++)
-		seq_put_decimal_ull(m, '\t', task_tgid_nr_ns(p, pid->numbers[g].ns));
+		seq_put_decimal_ull(m, "\t", task_tgid_nr_ns(p, pid->numbers[g].ns));
 	seq_puts(m, "\nNSpid:");
 	for (g = ns->level; g <= pid->level; g++)
-		seq_put_decimal_ull(m, '\t', task_pid_nr_ns(p, pid->numbers[g].ns));
+		seq_put_decimal_ull(m, "\t", task_pid_nr_ns(p, pid->numbers[g].ns));
 	seq_puts(m, "\nNSpgid:");
 	for (g = ns->level; g <= pid->level; g++)
-		seq_put_decimal_ull(m, '\t', task_pgrp_nr_ns(p, pid->numbers[g].ns));
+		seq_put_decimal_ull(m, "\t", task_pgrp_nr_ns(p, pid->numbers[g].ns));
 	seq_puts(m, "\nNSsid:");
 	for (g = ns->level; g <= pid->level; g++)
-		seq_put_decimal_ull(m, '\t', task_session_nr_ns(p, pid->numbers[g].ns));
+		seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
 #endif
 	seq_putc(m, '\n');
 }
@@ -300,11 +293,9 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 		unlock_task_sighand(p, &flags);
 	}
 
-	seq_puts(m, "Threads:\t");
-	seq_put_decimal_ull(m, 0, num_threads);
-	seq_puts(m, "\nSigQ:\t");
-	seq_put_decimal_ull(m, 0, qsize);
-	seq_put_decimal_ull(m, '/', qlim);
+	seq_put_decimal_ull(m, "Threads:\t", num_threads);
+	seq_put_decimal_ull(m, "\nSigQ:\t", qsize);
+	seq_put_decimal_ull(m, "/", qlim);
 
 	/* render them all */
 	render_sigset_t(m, "\nSigPnd:\t", &pending);
@@ -352,8 +343,7 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
 static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 {
 #ifdef CONFIG_SECCOMP
-	seq_puts(m, "Seccomp:\t");
-	seq_put_decimal_ull(m, 0, p->seccomp.mode);
+	seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode);
 	seq_putc(m, '\n');
 #endif
 }
@@ -361,10 +351,8 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 static inline void task_context_switch_counts(struct seq_file *m,
 						struct task_struct *p)
 {
-	seq_puts(m, "voluntary_ctxt_switches:\t");
-	seq_put_decimal_ull(m, 0, p->nvcsw);
-	seq_puts(m, "\nnonvoluntary_ctxt_switches:\t");
-	seq_put_decimal_ull(m, 0, p->nivcsw);
+	seq_put_decimal_ull(m, "voluntary_ctxt_switches:\t", p->nvcsw);
+	seq_put_decimal_ull(m, "\nnonvoluntary_ctxt_switches:\t", p->nivcsw);
 	seq_putc(m, '\n');
 }
 
@@ -497,41 +485,41 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	start_time = nsec_to_clock_t(task->real_start_time);
 
 	seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
-	seq_put_decimal_ll(m, ' ', ppid);
-	seq_put_decimal_ll(m, ' ', pgid);
-	seq_put_decimal_ll(m, ' ', sid);
-	seq_put_decimal_ll(m, ' ', tty_nr);
-	seq_put_decimal_ll(m, ' ', tty_pgrp);
-	seq_put_decimal_ull(m, ' ', task->flags);
-	seq_put_decimal_ull(m, ' ', min_flt);
-	seq_put_decimal_ull(m, ' ', cmin_flt);
-	seq_put_decimal_ull(m, ' ', maj_flt);
-	seq_put_decimal_ull(m, ' ', cmaj_flt);
-	seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime));
-	seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime));
-	seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime));
-	seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime));
-	seq_put_decimal_ll(m, ' ', priority);
-	seq_put_decimal_ll(m, ' ', nice);
-	seq_put_decimal_ll(m, ' ', num_threads);
-	seq_put_decimal_ull(m, ' ', 0);
-	seq_put_decimal_ull(m, ' ', start_time);
-	seq_put_decimal_ull(m, ' ', vsize);
-	seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0);
-	seq_put_decimal_ull(m, ' ', rsslim);
-	seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0);
-	seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0);
-	seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0);
-	seq_put_decimal_ull(m, ' ', esp);
-	seq_put_decimal_ull(m, ' ', eip);
+	seq_put_decimal_ll(m, " ", ppid);
+	seq_put_decimal_ll(m, " ", pgid);
+	seq_put_decimal_ll(m, " ", sid);
+	seq_put_decimal_ll(m, " ", tty_nr);
+	seq_put_decimal_ll(m, " ", tty_pgrp);
+	seq_put_decimal_ull(m, " ", task->flags);
+	seq_put_decimal_ull(m, " ", min_flt);
+	seq_put_decimal_ull(m, " ", cmin_flt);
+	seq_put_decimal_ull(m, " ", maj_flt);
+	seq_put_decimal_ull(m, " ", cmaj_flt);
+	seq_put_decimal_ull(m, " ", cputime_to_clock_t(utime));
+	seq_put_decimal_ull(m, " ", cputime_to_clock_t(stime));
+	seq_put_decimal_ll(m, " ", cputime_to_clock_t(cutime));
+	seq_put_decimal_ll(m, " ", cputime_to_clock_t(cstime));
+	seq_put_decimal_ll(m, " ", priority);
+	seq_put_decimal_ll(m, " ", nice);
+	seq_put_decimal_ll(m, " ", num_threads);
+	seq_put_decimal_ull(m, " ", 0);
+	seq_put_decimal_ull(m, " ", start_time);
+	seq_put_decimal_ull(m, " ", vsize);
+	seq_put_decimal_ull(m, " ", mm ? get_mm_rss(mm) : 0);
+	seq_put_decimal_ull(m, " ", rsslim);
+	seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->start_code : 1) : 0);
+	seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->end_code : 1) : 0);
+	seq_put_decimal_ull(m, " ", (permitted && mm) ? mm->start_stack : 0);
+	seq_put_decimal_ull(m, " ", esp);
+	seq_put_decimal_ull(m, " ", eip);
 	/* The signal information here is obsolete.
 	 * It must be decimal for Linux 2.0 compatibility.
 	 * Use /proc/#/status for real-time signals.
 	 */
-	seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL);
-	seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
-	seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
-	seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
+	seq_put_decimal_ull(m, " ", task->pending.signal.sig[0] & 0x7fffffffUL);
+	seq_put_decimal_ull(m, " ", task->blocked.sig[0] & 0x7fffffffUL);
+	seq_put_decimal_ull(m, " ", sigign.sig[0] & 0x7fffffffUL);
+	seq_put_decimal_ull(m, " ", sigcatch.sig[0] & 0x7fffffffUL);
 
 	/*
 	 * We used to output the absolute kernel address, but that's an
@@ -545,31 +533,31 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	else
 		seq_puts(m, " 0");
 
-	seq_put_decimal_ull(m, ' ', 0);
-	seq_put_decimal_ull(m, ' ', 0);
-	seq_put_decimal_ll(m, ' ', task->exit_signal);
-	seq_put_decimal_ll(m, ' ', task_cpu(task));
-	seq_put_decimal_ull(m, ' ', task->rt_priority);
-	seq_put_decimal_ull(m, ' ', task->policy);
-	seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
-	seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime));
-	seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime));
+	seq_put_decimal_ull(m, " ", 0);
+	seq_put_decimal_ull(m, " ", 0);
+	seq_put_decimal_ll(m, " ", task->exit_signal);
+	seq_put_decimal_ll(m, " ", task_cpu(task));
+	seq_put_decimal_ull(m, " ", task->rt_priority);
+	seq_put_decimal_ull(m, " ", task->policy);
+	seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task));
+	seq_put_decimal_ull(m, " ", cputime_to_clock_t(gtime));
+	seq_put_decimal_ll(m, " ", cputime_to_clock_t(cgtime));
 
 	if (mm && permitted) {
-		seq_put_decimal_ull(m, ' ', mm->start_data);
-		seq_put_decimal_ull(m, ' ', mm->end_data);
-		seq_put_decimal_ull(m, ' ', mm->start_brk);
-		seq_put_decimal_ull(m, ' ', mm->arg_start);
-		seq_put_decimal_ull(m, ' ', mm->arg_end);
-		seq_put_decimal_ull(m, ' ', mm->env_start);
-		seq_put_decimal_ull(m, ' ', mm->env_end);
+		seq_put_decimal_ull(m, " ", mm->start_data);
+		seq_put_decimal_ull(m, " ", mm->end_data);
+		seq_put_decimal_ull(m, " ", mm->start_brk);
+		seq_put_decimal_ull(m, " ", mm->arg_start);
+		seq_put_decimal_ull(m, " ", mm->arg_end);
+		seq_put_decimal_ull(m, " ", mm->env_start);
+		seq_put_decimal_ull(m, " ", mm->env_end);
 	} else
-		seq_printf(m, " 0 0 0 0 0 0 0");
+		seq_puts(m, " 0 0 0 0 0 0 0");
 
 	if (permitted)
-		seq_put_decimal_ll(m, ' ', task->exit_code);
+		seq_put_decimal_ll(m, " ", task->exit_code);
 	else
-		seq_put_decimal_ll(m, ' ', 0);
+		seq_puts(m, " 0");
 
 	seq_putc(m, '\n');
 	if (mm)
@@ -605,13 +593,13 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
 	 * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
 	 *               size, resident, shared, text, data);
 	 */
-	seq_put_decimal_ull(m, 0, size);
-	seq_put_decimal_ull(m, ' ', resident);
-	seq_put_decimal_ull(m, ' ', shared);
-	seq_put_decimal_ull(m, ' ', text);
-	seq_put_decimal_ull(m, ' ', 0);
-	seq_put_decimal_ull(m, ' ', data);
-	seq_put_decimal_ull(m, ' ', 0);
+	seq_put_decimal_ull(m, "", size);
+	seq_put_decimal_ull(m, " ", resident);
+	seq_put_decimal_ull(m, " ", shared);
+	seq_put_decimal_ull(m, " ", text);
+	seq_put_decimal_ull(m, " ", 0);
+	seq_put_decimal_ull(m, " ", data);
+	seq_put_decimal_ull(m, " ", 0);
 	seq_putc(m, '\n');
 
 	return 0;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7907e456ac4f..d700c42b3572 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -115,17 +115,16 @@ static int show_stat(struct seq_file *p, void *v)
 	}
 	sum += arch_irq_stat();
 
-	seq_puts(p, "cpu ");
-	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
-	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
-	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
-	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
-	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
-	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
-	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
-	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
-	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
-	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+	seq_put_decimal_ull(p, "cpu  ", cputime64_to_clock_t(user));
+	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice));
+	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system));
+	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle));
+	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait));
+	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq));
+	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq));
+	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal));
+	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest));
+	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice));
 	seq_putc(p, '\n');
 
 	for_each_online_cpu(i) {
@@ -141,23 +140,23 @@ static int show_stat(struct seq_file *p, void *v)
 		guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
 		guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
 		seq_printf(p, "cpu%d", i);
-		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
-		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
-		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
-		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
-		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
-		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
-		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
-		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
-		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
-		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(user));
+		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice));
+		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system));
+		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle));
+		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait));
+		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq));
+		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq));
+		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal));
+		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest));
+		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice));
 		seq_putc(p, '\n');
 	}
-	seq_printf(p, "intr %llu", (unsigned long long)sum);
+	seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
 	for_each_irq_nr(j)
-		seq_put_decimal_ull(p, ' ', kstat_irqs_usr(j));
+		seq_put_decimal_ull(p, " ", kstat_irqs_usr(j));
 
 	seq_printf(p,
 		"\nctxt %llu\n"
@@ -171,10 +170,10 @@ static int show_stat(struct seq_file *p, void *v)
 		nr_running(),
 		nr_iowait());
 
-	seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
+	seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq);
 
 	for (i = 0; i < NR_SOFTIRQS; i++)
-		seq_put_decimal_ull(p, ' ', per_softirq_sums[i]);
+		seq_put_decimal_ull(p, " ", per_softirq_sums[i]);
 	seq_putc(p, '\n');
 
 	return 0;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 6dc4296eed62..368bfb92b115 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -679,11 +679,11 @@ EXPORT_SYMBOL(seq_puts);
 /*
  * A helper routine for putting decimal numbers without rich format of printf().
  * only 'unsigned long long' is supported.
- * This routine will put one byte delimiter + number into seq_file.
+ * This routine will put strlen(delimiter) + number into seq_file.
  * This routine is very quick when you show lots of numbers.
  * In usual cases, it will be better to use seq_printf(). It's easier to read.
  */
-void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
 			 unsigned long long num)
 {
 	int len;
@@ -691,8 +691,15 @@ void seq_put_decimal_ull(struct seq_file *m, char delimiter,
 	if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */
 		goto overflow;
 
-	if (delimiter)
-		m->buf[m->count++] = delimiter;
+	len = strlen(delimiter);
+	if (m->count + len >= m->size)
+		goto overflow;
+
+	memcpy(m->buf + m->count, delimiter, len);
+	m->count += len;
+
+	if (m->count + 1 >= m->size)
+		goto overflow;
 
 	if (num < 10) {
 		m->buf[m->count++] = num + '0';
@@ -702,6 +709,7 @@ void seq_put_decimal_ull(struct seq_file *m, char delimiter,
 	len = num_to_str(m->buf + m->count, m->size - m->count, num);
 	if (!len)
 		goto overflow;
+
 	m->count += len;
 	return;
 
@@ -710,19 +718,42 @@ overflow:
 }
 EXPORT_SYMBOL(seq_put_decimal_ull);
 
-void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num)
+void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num)
 {
+	int len;
+
+	if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */
+		goto overflow;
+
+	len = strlen(delimiter);
+	if (m->count + len >= m->size)
+		goto overflow;
+
+	memcpy(m->buf + m->count, delimiter, len);
+	m->count += len;
+
+	if (m->count + 2 >= m->size)
+		goto overflow;
+
 	if (num < 0) {
-		if (m->count + 3 >= m->size) {
-			seq_set_overflow(m);
-			return;
-		}
-		if (delimiter)
-			m->buf[m->count++] = delimiter;
+		m->buf[m->count++] = '-';
 		num = -num;
-		delimiter = '-';
 	}
-	seq_put_decimal_ull(m, delimiter, num);
+
+	if (num < 10) {
+		m->buf[m->count++] = num + '0';
+		return;
+	}
+
+	len = num_to_str(m->buf + m->count, m->size - m->count, num);
+	if (!len)
+		goto overflow;
+
+	m->count += len;
+	return;
+
+overflow:
+	seq_set_overflow(m);
 }
 EXPORT_SYMBOL(seq_put_decimal_ll);
 
-- 
cgit 


From e16e2d8e14a14bd87df8482c637dde8f760a8d5f Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Fri, 7 Oct 2016 17:02:23 -0700
Subject: meminfo: break apart a very long seq_printf with #ifdefs

Use a specific routine to emit most lines so that the code is easier to
read and maintain.

akpm:
   text    data     bss     dec     hex filename
   2976       8       0    2984     ba8 fs/proc/meminfo.o before
   2669       8       0    2677     a75 fs/proc/meminfo.o after

Link: http://lkml.kernel.org/r/8fce7fdef2ba081a4ef531594e97da8a9feebb58.1470810406.git.joe@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/meminfo.c | 211 ++++++++++++++++++++++++------------------------------
 1 file changed, 95 insertions(+), 116 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b9a8c813e5e6..8a428498d6b2 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -23,6 +23,25 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
 {
 }
 
+static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
+{
+	char v[32];
+	static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '};
+	int len;
+
+	len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10));
+
+	seq_write(m, s, 16);
+
+	if (len > 0) {
+		if (len < 8)
+			seq_write(m, blanks, 8 - len);
+
+		seq_write(m, v, len);
+	}
+	seq_write(m, " kB\n", 4);
+}
+
 static int meminfo_proc_show(struct seq_file *m, void *v)
 {
 	struct sysinfo i;
@@ -32,10 +51,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	unsigned long pages[NR_LRU_LISTS];
 	int lru;
 
-/*
- * display in kilobytes.
- */
-#define K(x) ((x) << (PAGE_SHIFT - 10))
 	si_meminfo(&i);
 	si_swapinfo(&i);
 	committed = percpu_counter_read_positive(&vm_committed_as);
@@ -50,136 +65,100 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 
 	available = si_mem_available();
 
-	/*
-	 * Tagged format, for easy grepping and expansion.
-	 */
-	seq_printf(m,
-		"MemTotal:       %8lu kB\n"
-		"MemFree:        %8lu kB\n"
-		"MemAvailable:   %8lu kB\n"
-		"Buffers:        %8lu kB\n"
-		"Cached:         %8lu kB\n"
-		"SwapCached:     %8lu kB\n"
-		"Active:         %8lu kB\n"
-		"Inactive:       %8lu kB\n"
-		"Active(anon):   %8lu kB\n"
-		"Inactive(anon): %8lu kB\n"
-		"Active(file):   %8lu kB\n"
-		"Inactive(file): %8lu kB\n"
-		"Unevictable:    %8lu kB\n"
-		"Mlocked:        %8lu kB\n"
-#ifdef CONFIG_HIGHMEM
-		"HighTotal:      %8lu kB\n"
-		"HighFree:       %8lu kB\n"
-		"LowTotal:       %8lu kB\n"
-		"LowFree:        %8lu kB\n"
-#endif
-#ifndef CONFIG_MMU
-		"MmapCopy:       %8lu kB\n"
-#endif
-		"SwapTotal:      %8lu kB\n"
-		"SwapFree:       %8lu kB\n"
-		"Dirty:          %8lu kB\n"
-		"Writeback:      %8lu kB\n"
-		"AnonPages:      %8lu kB\n"
-		"Mapped:         %8lu kB\n"
-		"Shmem:          %8lu kB\n"
-		"Slab:           %8lu kB\n"
-		"SReclaimable:   %8lu kB\n"
-		"SUnreclaim:     %8lu kB\n"
-		"KernelStack:    %8lu kB\n"
-		"PageTables:     %8lu kB\n"
-#ifdef CONFIG_QUICKLIST
-		"Quicklists:     %8lu kB\n"
-#endif
-		"NFS_Unstable:   %8lu kB\n"
-		"Bounce:         %8lu kB\n"
-		"WritebackTmp:   %8lu kB\n"
-		"CommitLimit:    %8lu kB\n"
-		"Committed_AS:   %8lu kB\n"
-		"VmallocTotal:   %8lu kB\n"
-		"VmallocUsed:    %8lu kB\n"
-		"VmallocChunk:   %8lu kB\n"
-#ifdef CONFIG_MEMORY_FAILURE
-		"HardwareCorrupted: %5lu kB\n"
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		"AnonHugePages:  %8lu kB\n"
-		"ShmemHugePages: %8lu kB\n"
-		"ShmemPmdMapped: %8lu kB\n"
-#endif
-#ifdef CONFIG_CMA
-		"CmaTotal:       %8lu kB\n"
-		"CmaFree:        %8lu kB\n"
-#endif
-		,
-		K(i.totalram),
-		K(i.freeram),
-		K(available),
-		K(i.bufferram),
-		K(cached),
-		K(total_swapcache_pages()),
-		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
-		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
-		K(pages[LRU_ACTIVE_ANON]),
-		K(pages[LRU_INACTIVE_ANON]),
-		K(pages[LRU_ACTIVE_FILE]),
-		K(pages[LRU_INACTIVE_FILE]),
-		K(pages[LRU_UNEVICTABLE]),
-		K(global_page_state(NR_MLOCK)),
+	show_val_kb(m, "MemTotal:       ", i.totalram);
+	show_val_kb(m, "MemFree:        ", i.freeram);
+	show_val_kb(m, "MemAvailable:   ", available);
+	show_val_kb(m, "Buffers:        ", i.bufferram);
+	show_val_kb(m, "Cached:         ", cached);
+	show_val_kb(m, "SwapCached:     ", total_swapcache_pages());
+	show_val_kb(m, "Active:         ", pages[LRU_ACTIVE_ANON] +
+					   pages[LRU_ACTIVE_FILE]);
+	show_val_kb(m, "Inactive:       ", pages[LRU_INACTIVE_ANON] +
+					   pages[LRU_INACTIVE_FILE]);
+	show_val_kb(m, "Active(anon):   ", pages[LRU_ACTIVE_ANON]);
+	show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]);
+	show_val_kb(m, "Active(file):   ", pages[LRU_ACTIVE_FILE]);
+	show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
+	show_val_kb(m, "Unevictable:    ", pages[LRU_UNEVICTABLE]);
+	show_val_kb(m, "Mlocked:        ", global_page_state(NR_MLOCK));
+
 #ifdef CONFIG_HIGHMEM
-		K(i.totalhigh),
-		K(i.freehigh),
-		K(i.totalram-i.totalhigh),
-		K(i.freeram-i.freehigh),
+	show_val_kb(m, "HighTotal:      ", i.totalhigh);
+	show_val_kb(m, "HighFree:       ", i.freehigh);
+	show_val_kb(m, "LowTotal:       ", i.totalram - i.totalhigh);
+	show_val_kb(m, "LowFree:        ", i.freeram - i.freehigh);
 #endif
+
 #ifndef CONFIG_MMU
-		K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
+	show_val_kb(m, "MmapCopy:       ",
+		    (unsigned long)atomic_long_read(&mmap_pages_allocated));
 #endif
-		K(i.totalswap),
-		K(i.freeswap),
-		K(global_node_page_state(NR_FILE_DIRTY)),
-		K(global_node_page_state(NR_WRITEBACK)),
-		K(global_node_page_state(NR_ANON_MAPPED)),
-		K(global_node_page_state(NR_FILE_MAPPED)),
-		K(i.sharedram),
-		K(global_page_state(NR_SLAB_RECLAIMABLE) +
-				global_page_state(NR_SLAB_UNRECLAIMABLE)),
-		K(global_page_state(NR_SLAB_RECLAIMABLE)),
-		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
-		global_page_state(NR_KERNEL_STACK_KB),
-		K(global_page_state(NR_PAGETABLE)),
+
+	show_val_kb(m, "SwapTotal:      ", i.totalswap);
+	show_val_kb(m, "SwapFree:       ", i.freeswap);
+	show_val_kb(m, "Dirty:          ",
+		    global_node_page_state(NR_FILE_DIRTY));
+	show_val_kb(m, "Writeback:      ",
+		    global_node_page_state(NR_WRITEBACK));
+	show_val_kb(m, "AnonPages:      ",
+		    global_node_page_state(NR_ANON_MAPPED));
+	show_val_kb(m, "Mapped:         ",
+		    global_node_page_state(NR_FILE_MAPPED));
+	show_val_kb(m, "Shmem:          ", i.sharedram);
+	show_val_kb(m, "Slab:           ",
+		    global_page_state(NR_SLAB_RECLAIMABLE) +
+		    global_page_state(NR_SLAB_UNRECLAIMABLE));
+
+	show_val_kb(m, "SReclaimable:   ",
+		    global_page_state(NR_SLAB_RECLAIMABLE));
+	show_val_kb(m, "SUnreclaim:     ",
+		    global_page_state(NR_SLAB_UNRECLAIMABLE));
+	seq_printf(m, "KernelStack:    %8lu kB\n",
+		   global_page_state(NR_KERNEL_STACK_KB));
+	show_val_kb(m, "PageTables:     ",
+		    global_page_state(NR_PAGETABLE));
 #ifdef CONFIG_QUICKLIST
-		K(quicklist_total_size()),
+	show_val_kb(m, "Quicklists:     ", quicklist_total_size());
 #endif
-		K(global_node_page_state(NR_UNSTABLE_NFS)),
-		K(global_page_state(NR_BOUNCE)),
-		K(global_node_page_state(NR_WRITEBACK_TEMP)),
-		K(vm_commit_limit()),
-		K(committed),
-		(unsigned long)VMALLOC_TOTAL >> 10,
-		0ul, // used to be vmalloc 'used'
-		0ul  // used to be vmalloc 'largest_chunk'
+
+	show_val_kb(m, "NFS_Unstable:   ",
+		    global_node_page_state(NR_UNSTABLE_NFS));
+	show_val_kb(m, "Bounce:         ",
+		    global_page_state(NR_BOUNCE));
+	show_val_kb(m, "WritebackTmp:   ",
+		    global_node_page_state(NR_WRITEBACK_TEMP));
+	show_val_kb(m, "CommitLimit:    ", vm_commit_limit());
+	show_val_kb(m, "Committed_AS:   ", committed);
+	seq_printf(m, "VmallocTotal:   %8lu kB\n",
+		   (unsigned long)VMALLOC_TOTAL >> 10);
+	show_val_kb(m, "VmallocUsed:    ", 0ul);
+	show_val_kb(m, "VmallocChunk:   ", 0ul);
+
 #ifdef CONFIG_MEMORY_FAILURE
-		, atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
+	seq_printf(m, "HardwareCorrupted: %5lu kB\n",
+		   atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10));
 #endif
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		, K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
-		, K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
-		, K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
+	show_val_kb(m, "AnonHugePages:  ",
+		    global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);
+	show_val_kb(m, "ShmemHugePages: ",
+		    global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
+	show_val_kb(m, "ShmemPmdMapped: ",
+		    global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
 #endif
+
 #ifdef CONFIG_CMA
-		, K(totalcma_pages)
-		, K(global_page_state(NR_FREE_CMA_PAGES))
+	show_val_kb(m, "CmaTotal:       ", totalcma_pages);
+	show_val_kb(m, "CmaFree:        ",
+		    global_page_state(NR_FREE_CMA_PAGES));
 #endif
-		);
 
 	hugetlb_report_meminfo(m);
 
 	arch_report_meminfo(m);
 
 	return 0;
-#undef K
 }
 
 static int meminfo_proc_open(struct inode *inode, struct file *file)
-- 
cgit 


From 7abbaf94049914f074306d960b0f968ffe52e59f Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Fri, 7 Oct 2016 17:02:26 -0700
Subject: proc: relax /proc/<tid>/timerslack_ns capability requirements

When an interface to allow a task to change another tasks timerslack was
first proposed, it was suggested that something greater then
CAP_SYS_NICE would be needed, as a task could be delayed further then
what normally could be done with nice adjustments.

So CAP_SYS_PTRACE was adopted instead for what became the
/proc/<tid>/timerslack_ns interface.  However, for Android (where this
feature originates), giving the system_server CAP_SYS_PTRACE would allow
it to observe and modify all tasks memory.  This is considered too high
a privilege level for only needing to change the timerslack.

After some discussion, it was realized that a CAP_SYS_NICE process can
set a task as SCHED_FIFO, so they could fork some spinning processes and
set them all SCHED_FIFO 99, in effect delaying all other tasks for an
infinite amount of time.

So as a CAP_SYS_NICE task can already cause trouble for other tasks,
using it as a required capability for accessing and modifying
/proc/<tid>/timerslack_ns seems sufficient.

Thus, this patch loosens the capability requirements to CAP_SYS_NICE and
removes CAP_SYS_PTRACE, simplifying some of the code flow as well.

This is technically an ABI change, but as the feature just landed in
4.6, I suspect no one is yet using it.

Link: http://lkml.kernel.org/r/1469132667-17377-1-git-send-email-john.stultz@linaro.org
Signed-off-by: John Stultz <john.stultz@linaro.org>
Reviewed-by: Nick Kralevich <nnk@google.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Oren Laadan <orenl@cellrox.com>
Cc: Ruchi Kandoi <kandoiruchi@google.com>
Cc: Rom Lemarchand <romlem@android.com>
Cc: Todd Kjos <tkjos@google.com>
Cc: Colin Cross <ccross@android.com>
Cc: Nick Kralevich <nnk@google.com>
Cc: Dmitry Shmidt <dimitrysh@google.com>
Cc: Elliott Hughes <enh@google.com>
Cc: Android Kernel Team <kernel-team@android.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3b792ab3c0dc..248f008d46b8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2280,16 +2280,19 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
 	if (!p)
 		return -ESRCH;
 
-	if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
-		task_lock(p);
-		if (slack_ns == 0)
-			p->timer_slack_ns = p->default_timer_slack_ns;
-		else
-			p->timer_slack_ns = slack_ns;
-		task_unlock(p);
-	} else
+	if (!capable(CAP_SYS_NICE)) {
 		count = -EPERM;
+		goto out;
+	}
 
+	task_lock(p);
+	if (slack_ns == 0)
+		p->timer_slack_ns = p->default_timer_slack_ns;
+	else
+		p->timer_slack_ns = slack_ns;
+	task_unlock(p);
+
+out:
 	put_task_struct(p);
 
 	return count;
@@ -2299,19 +2302,22 @@ static int timerslack_ns_show(struct seq_file *m, void *v)
 {
 	struct inode *inode = m->private;
 	struct task_struct *p;
-	int err =  0;
+	int err = 0;
 
 	p = get_proc_task(inode);
 	if (!p)
 		return -ESRCH;
 
-	if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
-		task_lock(p);
-		seq_printf(m, "%llu\n", p->timer_slack_ns);
-		task_unlock(p);
-	} else
+	if (!capable(CAP_SYS_NICE)) {
 		err = -EPERM;
+		goto out;
+	}
 
+	task_lock(p);
+	seq_printf(m, "%llu\n", p->timer_slack_ns);
+	task_unlock(p);
+
+out:
 	put_task_struct(p);
 
 	return err;
-- 
cgit 


From 904763e1fb5eebf8249ec41a2019e5e32246df2f Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Fri, 7 Oct 2016 17:02:29 -0700
Subject: proc: add LSM hook checks to /proc/<tid>/timerslack_ns

As requested, this patch checks the existing LSM hooks
task_getscheduler/task_setscheduler when reading or modifying the task's
timerslack value.

Previous versions added new get/settimerslack LSM hooks, but since they
checked the same PROCESS__SET/GETSCHED values as existing hooks, it was
suggested we just use the existing ones.

Link: http://lkml.kernel.org/r/1469132667-17377-2-git-send-email-john.stultz@linaro.org
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Oren Laadan <orenl@cellrox.com>
Cc: Ruchi Kandoi <kandoiruchi@google.com>
Cc: Rom Lemarchand <romlem@android.com>
Cc: Todd Kjos <tkjos@google.com>
Cc: Colin Cross <ccross@android.com>
Cc: Nick Kralevich <nnk@google.com>
Cc: Dmitry Shmidt <dimitrysh@google.com>
Cc: Elliott Hughes <enh@google.com>
Cc: James Morris <jmorris@namei.org>
Cc: Android Kernel Team <kernel-team@android.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 248f008d46b8..ebccdc192830 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2285,6 +2285,12 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
 		goto out;
 	}
 
+	err = security_task_setscheduler(p);
+	if (err) {
+		count = err;
+		goto out;
+	}
+
 	task_lock(p);
 	if (slack_ns == 0)
 		p->timer_slack_ns = p->default_timer_slack_ns;
@@ -2313,6 +2319,10 @@ static int timerslack_ns_show(struct seq_file *m, void *v)
 		goto out;
 	}
 
+	err = security_task_getscheduler(p);
+	if (err)
+		goto out;
+
 	task_lock(p);
 	seq_printf(m, "%llu\n", p->timer_slack_ns);
 	task_unlock(p);
-- 
cgit 


From 4b2bd5fec007a4fd3fc82474b9199af25013de4c Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Fri, 7 Oct 2016 17:02:33 -0700
Subject: proc: fix timerslack_ns CAP_SYS_NICE check when adjusting self

In changing from checking ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)
to capable(CAP_SYS_NICE), I missed that ptrace_my_access succeeds when p
== current, but the CAP_SYS_NICE doesn't.

Thus while the previous commit was intended to loosen the needed
privileges to modify a processes timerslack, it needlessly restricted a
task modifying its own timerslack via the proc/<tid>/timerslack_ns
(which is permitted also via the PR_SET_TIMERSLACK method).

This patch corrects this by checking if p == current before checking the
CAP_SYS_NICE value.

This patch applies on top of my two previous patches currently in -mm

Link: http://lkml.kernel.org/r/1471906870-28624-1-git-send-email-john.stultz@linaro.org
Signed-off-by: John Stultz <john.stultz@linaro.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Oren Laadan <orenl@cellrox.com>
Cc: Ruchi Kandoi <kandoiruchi@google.com>
Cc: Rom Lemarchand <romlem@android.com>
Cc: Todd Kjos <tkjos@google.com>
Cc: Colin Cross <ccross@android.com>
Cc: Nick Kralevich <nnk@google.com>
Cc: Dmitry Shmidt <dimitrysh@google.com>
Cc: Elliott Hughes <enh@google.com>
Cc: Android Kernel Team <kernel-team@android.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index ebccdc192830..dc7fe5f3a53c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2280,15 +2280,17 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
 	if (!p)
 		return -ESRCH;
 
-	if (!capable(CAP_SYS_NICE)) {
-		count = -EPERM;
-		goto out;
-	}
+	if (p != current) {
+		if (!capable(CAP_SYS_NICE)) {
+			count = -EPERM;
+			goto out;
+		}
 
-	err = security_task_setscheduler(p);
-	if (err) {
-		count = err;
-		goto out;
+		err = security_task_setscheduler(p);
+		if (err) {
+			count = err;
+			goto out;
+		}
 	}
 
 	task_lock(p);
@@ -2314,14 +2316,16 @@ static int timerslack_ns_show(struct seq_file *m, void *v)
 	if (!p)
 		return -ESRCH;
 
-	if (!capable(CAP_SYS_NICE)) {
-		err = -EPERM;
-		goto out;
-	}
+	if (p != current) {
 
-	err = security_task_getscheduler(p);
-	if (err)
-		goto out;
+		if (!capable(CAP_SYS_NICE)) {
+			err = -EPERM;
+			goto out;
+		}
+		err = security_task_getscheduler(p);
+		if (err)
+			goto out;
+	}
 
 	task_lock(p);
 	seq_printf(m, "%llu\n", p->timer_slack_ns);
-- 
cgit 


From 855af072b6c40aeb266f4dc98fd9a6a49edf22af Mon Sep 17 00:00:00 2001
From: Robert Ho <robert.hu@intel.com>
Date: Fri, 7 Oct 2016 17:02:36 -0700
Subject: mm, proc: fix region lost in /proc/self/smaps

Recently, Redhat reported that nvml test suite failed on QEMU/KVM,
more detailed info please refer to:

   https://bugzilla.redhat.com/show_bug.cgi?id=1365721

Actually, this bug is not only for NVDIMM/DAX but also for any other
file systems.  This simple test case abstracted from nvml can easily
reproduce this bug in common environment:

-------------------------- testcase.c -----------------------------

int
is_pmem_proc(const void *addr, size_t len)
{
        const char *caddr = addr;

        FILE *fp;
        if ((fp = fopen("/proc/self/smaps", "r")) == NULL) {
                printf("!/proc/self/smaps");
                return 0;
        }

        int retval = 0;         /* assume false until proven otherwise */
        char line[PROCMAXLEN];  /* for fgets() */
        char *lo = NULL;        /* beginning of current range in smaps file */
        char *hi = NULL;        /* end of current range in smaps file */
        int needmm = 0;         /* looking for mm flag for current range */
        while (fgets(line, PROCMAXLEN, fp) != NULL) {
                static const char vmflags[] = "VmFlags:";
                static const char mm[] = " wr";

                /* check for range line */
                if (sscanf(line, "%p-%p", &lo, &hi) == 2) {
                        if (needmm) {
                                /* last range matched, but no mm flag found */
                                printf("never found mm flag.\n");
                                break;
                        } else if (caddr < lo) {
                                /* never found the range for caddr */
                                printf("#######no match for addr %p.\n", caddr);
                                break;
                        } else if (caddr < hi) {
                                /* start address is in this range */
                                size_t rangelen = (size_t)(hi - caddr);

                                /* remember that matching has started */
                                needmm = 1;

                                /* calculate remaining range to search for */
                                if (len > rangelen) {
                                        len -= rangelen;
                                        caddr += rangelen;
                                        printf("matched %zu bytes in range "
                                                "%p-%p, %zu left over.\n",
                                                        rangelen, lo, hi, len);
                                } else {
                                        len = 0;
                                        printf("matched all bytes in range "
                                                        "%p-%p.\n", lo, hi);
                                }
                        }
                } else if (needmm && strncmp(line, vmflags,
                                        sizeof(vmflags) - 1) == 0) {
                        if (strstr(&line[sizeof(vmflags) - 1], mm) != NULL) {
                                printf("mm flag found.\n");
                                if (len == 0) {
                                        /* entire range matched */
                                        retval = 1;
                                        break;
                                }
                                needmm = 0;     /* saw what was needed */
                        } else {
                                /* mm flag not set for some or all of range */
                                printf("range has no mm flag.\n");
                                break;
                        }
                }
        }

        fclose(fp);

        printf("returning %d.\n", retval);
        return retval;
}

void *Addr;
size_t Size;

/*
 * worker -- the work each thread performs
 */
static void *
worker(void *arg)
{
        int *ret = (int *)arg;
        *ret =  is_pmem_proc(Addr, Size);
        return NULL;
}

int main(int argc, char *argv[])
{
        if (argc <  2 || argc > 3) {
                printf("usage: %s file [env].\n", argv[0]);
                return -1;
        }

        int fd = open(argv[1], O_RDWR);

        struct stat stbuf;
        fstat(fd, &stbuf);

        Size = stbuf.st_size;
        Addr = mmap(0, stbuf.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);

        close(fd);

        pthread_t threads[NTHREAD];
        int ret[NTHREAD];

        /* kick off NTHREAD threads */
        for (int i = 0; i < NTHREAD; i++)
                pthread_create(&threads[i], NULL, worker, &ret[i]);

        /* wait for all the threads to complete */
        for (int i = 0; i < NTHREAD; i++)
                pthread_join(threads[i], NULL);

        /* verify that all the threads return the same value */
        for (int i = 1; i < NTHREAD; i++) {
                if (ret[0] != ret[i]) {
                        printf("Error i %d ret[0] = %d ret[i] = %d.\n", i,
                                ret[0], ret[i]);
                }
        }

        printf("%d", ret[0]);
        return 0;
}

It failed as some threads can not find the memory region in
"/proc/self/smaps" which is allocated in the main process

It is caused by proc fs which uses 'file->version' to indicate the VMA that
is the last one has already been handled by read() system call. When the
next read() issues, it uses the 'version' to find the VMA, then the next
VMA is what we want to handle, the related code is as follows:

        if (last_addr) {
                vma = find_vma(mm, last_addr);
                if (vma && (vma = m_next_vma(priv, vma)))
                        return vma;
        }

However, VMA will be lost if the last VMA is gone, e.g:

The process VMA list is A->B->C->D

CPU 0                                  CPU 1
read() system call
   handle VMA B
   version = B
return to userspace

                                   unmap VMA B

issue read() again to continue to get
the region info
   find_vma(version) will get VMA C
   m_next_vma(C) will get VMA D
   handle D
   !!! VMA C is lost !!!

In order to fix this bug, we make 'file->version' indicate the end address
of the current VMA.  m_start will then look up a vma which with vma_start
< last_vm_end and moves on to the next vma if we found the same or an
overlapping vma.  This will guarantee that we will not miss an exclusive
vma but we can still miss one if the previous vma was shrunk.  This is
acceptable because guaranteeing "never miss a vma" is simply not feasible.
User has to cope with some inconsistencies if the file is not read in one
go.

[mhocko@suse.com: changelog fixes]
Link: http://lkml.kernel.org/r/1475296958-27652-1-git-send-email-robert.hu@intel.com
Acked-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Robert Hu <robert.hu@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Gleb Natapov <gleb@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index d2a70cf2154e..6909582ce5e5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -147,7 +147,7 @@ m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
 static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
 {
 	if (m->count < m->size)	/* vma is copied successfully */
-		m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
+		m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
 }
 
 static void *m_start(struct seq_file *m, loff_t *ppos)
@@ -175,8 +175,10 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
 	priv->tail_vma = get_gate_vma(mm);
 
 	if (last_addr) {
-		vma = find_vma(mm, last_addr);
-		if (vma && (vma = m_next_vma(priv, vma)))
+		vma = find_vma(mm, last_addr - 1);
+		if (vma && vma->vm_start <= last_addr)
+			vma = m_next_vma(priv, vma);
+		if (vma)
 			return vma;
 	}
 
-- 
cgit 


From 81243eacfa400f5f7b89f4c2323d0de9982bb0fb Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 7 Oct 2016 17:03:12 -0700
Subject: cred: simpler, 1D supplementary groups

Current supplementary groups code can massively overallocate memory and
is implemented in a way so that access to individual gid is done via 2D
array.

If number of gids is <= 32, memory allocation is more or less tolerable
(140/148 bytes).  But if it is not, code allocates full page (!)
regardless and, what's even more fun, doesn't reuse small 32-entry
array.

2D array means dependent shifts, loads and LEAs without possibility to
optimize them (gid is never known at compile time).

All of the above is unnecessary.  Switch to the usual
trailing-zero-len-array scheme.  Memory is allocated with
kmalloc/vmalloc() and only as much as needed.  Accesses become simpler
(LEA 8(gi,idx,4) or even without displacement).

Maximum number of gids is 65536 which translates to 256KB+8 bytes.  I
think kernel can handle such allocation.

On my usual desktop system with whole 9 (nine) aux groups, struct
group_info shrinks from 148 bytes to 44 bytes, yay!

Nice side effects:

 - "gi->gid[i]" is shorter than "GROUP_AT(gi, i)", less typing,

 - fix little mess in net/ipv4/ping.c
   should have been using GROUP_AT macro but this point becomes moot,

 - aux group allocation is persistent and should be accounted as such.

Link: http://lkml.kernel.org/r/20160817201927.GA2096@p183.telecom.by
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Vasily Kulikov <segoon@openwall.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/auth.c      | 6 +++---
 fs/nfsd/nfs4state.c | 2 +-
 fs/proc/array.c     | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 9d46a0bdd9f9..62469c60be23 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -55,10 +55,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 			goto oom;
 
 		for (i = 0; i < rqgi->ngroups; i++) {
-			if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i)))
-				GROUP_AT(gi, i) = exp->ex_anon_gid;
+			if (gid_eq(GLOBAL_ROOT_GID, rqgi->gid[i]))
+				gi->gid[i] = exp->ex_anon_gid;
 			else
-				GROUP_AT(gi, i) = GROUP_AT(rqgi, i);
+				gi->gid[i] = rqgi->gid[i];
 		}
 	} else {
 		gi = get_group_info(rqgi);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a204d7e109d4..39bfaba9c99c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1903,7 +1903,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2)
 	if (g1->ngroups != g2->ngroups)
 		return false;
 	for (i=0; i<g1->ngroups; i++)
-		if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i)))
+		if (!gid_eq(g1->gid[i], g2->gid[i]))
 			return false;
 	return true;
 }
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d25b44601b30..89600fd5963d 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -207,7 +207,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	group_info = cred->group_info;
 	for (g = 0; g < group_info->ngroups; g++)
 		seq_put_decimal_ull(m, g ? " " : "",
-				    from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
+				from_kgid_munged(user_ns, group_info->gid[g]));
 	put_cred(cred);
 	/* Trailing space shouldn't have been added in the first place. */
 	seq_putc(m, ' ');
-- 
cgit