Diffstat (limited to 'kernel')
38 files changed, 836 insertions, 439 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3737a682cdf5..b6eadfe30e7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -47,6 +47,7 @@  #include <linux/hash.h>  #include <linux/namei.h>  #include <linux/smp_lock.h> +#include <linux/pid_namespace.h>  #include <asm/atomic.h> @@ -734,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)   * reference to css->refcnt. In general, this refcnt is expected to goes down   * to zero, soon.   * - * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; + * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;   */  DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); -static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)  { -	if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) +	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))  		wake_up_all(&cgroup_rmdir_waitq);  } +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) +{ +	css_get(css); +} + +void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) +{ +	cgroup_wakeup_rmdir_waiter(css->cgroup); +	css_put(css); +} + +  static int rebind_subsystems(struct cgroupfs_root *root,  			      unsigned long final_bits)  { @@ -960,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)  	INIT_LIST_HEAD(&cgrp->children);  	INIT_LIST_HEAD(&cgrp->css_sets);  	INIT_LIST_HEAD(&cgrp->release_list); +	INIT_LIST_HEAD(&cgrp->pids_list);  	init_rwsem(&cgrp->pids_mutex);  }  static void init_cgroup_root(struct cgroupfs_root *root) @@ -1357,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  	 * wake up rmdir() waiter. the rmdir should fail since the cgroup  	 * is no longer empty.  	 */ -	cgroup_wakeup_rmdir_waiters(cgrp); +	cgroup_wakeup_rmdir_waiter(cgrp);  	return 0;  } @@ -2201,12 +2215,30 @@ err:  	return ret;  } +/* + * Cache pids for all threads in the same pid namespace that are + * opening the same "tasks" file. + */ +struct cgroup_pids { +	/* The node in cgrp->pids_list */ +	struct list_head list; +	/* The cgroup those pids belong to */ +	struct cgroup *cgrp; +	/* The namepsace those pids belong to */ +	struct pid_namespace *ns; +	/* Array of process ids in the cgroup */ +	pid_t *tasks_pids; +	/* How many files are using the this tasks_pids array */ +	int use_count; +	/* Length of the current tasks_pids array */ +	int length; +}; +  static int cmppid(const void *a, const void *b)  {  	return *(pid_t *)a - *(pid_t *)b;  } -  /*   * seq_file methods for the "tasks" file. The seq_file position is the   * next pid to display; the seq_file iterator is a pointer to the pid @@ -2221,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)  	 * after a seek to the start). 
Use a binary-search to find the  	 * next pid to display, if any  	 */ -	struct cgroup *cgrp = s->private; +	struct cgroup_pids *cp = s->private; +	struct cgroup *cgrp = cp->cgrp;  	int index = 0, pid = *pos;  	int *iter;  	down_read(&cgrp->pids_mutex);  	if (pid) { -		int end = cgrp->pids_length; +		int end = cp->length;  		while (index < end) {  			int mid = (index + end) / 2; -			if (cgrp->tasks_pids[mid] == pid) { +			if (cp->tasks_pids[mid] == pid) {  				index = mid;  				break; -			} else if (cgrp->tasks_pids[mid] <= pid) +			} else if (cp->tasks_pids[mid] <= pid)  				index = mid + 1;  			else  				end = mid;  		}  	}  	/* If we're off the end of the array, we're done */ -	if (index >= cgrp->pids_length) +	if (index >= cp->length)  		return NULL;  	/* Update the abstract position to be the actual pid that we found */ -	iter = cgrp->tasks_pids + index; +	iter = cp->tasks_pids + index;  	*pos = *iter;  	return iter;  }  static void cgroup_tasks_stop(struct seq_file *s, void *v)  { -	struct cgroup *cgrp = s->private; +	struct cgroup_pids *cp = s->private; +	struct cgroup *cgrp = cp->cgrp;  	up_read(&cgrp->pids_mutex);  }  static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)  { -	struct cgroup *cgrp = s->private; +	struct cgroup_pids *cp = s->private;  	int *p = v; -	int *end = cgrp->tasks_pids + cgrp->pids_length; +	int *end = cp->tasks_pids + cp->length;  	/*  	 * Advance to the next pid in the array. If this goes off the @@ -2286,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {  	.show = cgroup_tasks_show,  }; -static void release_cgroup_pid_array(struct cgroup *cgrp) +static void release_cgroup_pid_array(struct cgroup_pids *cp)  { +	struct cgroup *cgrp = cp->cgrp; +  	down_write(&cgrp->pids_mutex); -	BUG_ON(!cgrp->pids_use_count); -	if (!--cgrp->pids_use_count) { -		kfree(cgrp->tasks_pids); -		cgrp->tasks_pids = NULL; -		cgrp->pids_length = 0; +	BUG_ON(!cp->use_count); +	if (!--cp->use_count) { +		list_del(&cp->list); +		put_pid_ns(cp->ns); +		kfree(cp->tasks_pids); +		kfree(cp);  	}  	up_write(&cgrp->pids_mutex);  }  static int cgroup_tasks_release(struct inode *inode, struct file *file)  { -	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); +	struct seq_file *seq; +	struct cgroup_pids *cp;  	if (!(file->f_mode & FMODE_READ))  		return 0; -	release_cgroup_pid_array(cgrp); +	seq = file->private_data; +	cp = seq->private; + +	release_cgroup_pid_array(cp);  	return seq_release(inode, file);  } @@ -2324,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = {  static int cgroup_tasks_open(struct inode *unused, struct file *file)  {  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); +	struct pid_namespace *ns = current->nsproxy->pid_ns; +	struct cgroup_pids *cp;  	pid_t *pidarray;  	int npids;  	int retval; @@ -2350,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)  	 * array if necessary  	 */  	down_write(&cgrp->pids_mutex); -	kfree(cgrp->tasks_pids); -	cgrp->tasks_pids = pidarray; -	cgrp->pids_length = npids; -	cgrp->pids_use_count++; + +	list_for_each_entry(cp, &cgrp->pids_list, list) { +		if (ns == cp->ns) +			goto found; +	} + +	cp = kzalloc(sizeof(*cp), GFP_KERNEL); +	if (!cp) { +		up_write(&cgrp->pids_mutex); +		kfree(pidarray); +		return -ENOMEM; +	} +	cp->cgrp = cgrp; +	cp->ns = ns; +	get_pid_ns(ns); +	list_add(&cp->list, &cgrp->pids_list); +found: +	kfree(cp->tasks_pids); +	cp->tasks_pids = pidarray; +	cp->length = npids; +	cp->use_count++;  	
up_write(&cgrp->pids_mutex);  	file->f_op = &cgroup_tasks_operations;  	retval = seq_open(file, &cgroup_tasks_seq_operations);  	if (retval) { -		release_cgroup_pid_array(cgrp); +		release_cgroup_pid_array(cp);  		return retval;  	} -	((struct seq_file *)file->private_data)->private = cgrp; +	((struct seq_file *)file->private_data)->private = cp;  	return 0;  } @@ -2696,33 +2756,42 @@ again:  	mutex_unlock(&cgroup_mutex);  	/* +	 * In general, subsystem has no css->refcnt after pre_destroy(). But +	 * in racy cases, subsystem may have to get css->refcnt after +	 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes +	 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue +	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir +	 * and subsystem's reference count handling. Please see css_get/put +	 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. +	 */ +	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + +	/*  	 * Call pre_destroy handlers of subsys. Notify subsystems  	 * that rmdir() request comes.  	 */  	ret = cgroup_call_pre_destroy(cgrp); -	if (ret) +	if (ret) { +		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);  		return ret; +	}  	mutex_lock(&cgroup_mutex);  	parent = cgrp->parent;  	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { +		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);  		mutex_unlock(&cgroup_mutex);  		return -EBUSY;  	} -	/* -	 * css_put/get is provided for subsys to grab refcnt to css. In typical -	 * case, subsystem has no reference after pre_destroy(). But, under -	 * hierarchy management, some *temporal* refcnt can be hold. -	 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys -	 * is really busy, it should return -EBUSY at pre_destroy(). wake_up -	 * is called when css_put() is called and refcnt goes down to 0. -	 */ -	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);  	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); -  	if (!cgroup_clear_css_refs(cgrp)) {  		mutex_unlock(&cgroup_mutex); -		schedule(); +		/* +		 * Because someone may call cgroup_wakeup_rmdir_waiter() before +		 * prepare_to_wait(), we need to check this flag. +		 */ +		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) +			schedule();  		finish_wait(&cgroup_rmdir_waitq, &wait);  		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);  		if (signal_pending(current)) @@ -3294,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)  			set_bit(CGRP_RELEASABLE, &cgrp->flags);  			check_for_release(cgrp);  		} -		cgroup_wakeup_rmdir_waiters(cgrp); +		cgroup_wakeup_rmdir_waiter(cgrp);  	}  	rcu_read_unlock();  } diff --git a/kernel/fork.c b/kernel/fork.c index 9b42695f0d14..e6c04d462ab2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -567,18 +567,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)  	 * the value intact in a core dump, and to save the unnecessary  	 * trouble otherwise.  Userland only wants this done for a sys_exit.  	 */ -	if (tsk->clear_child_tid -	    && !(tsk->flags & PF_SIGNALED) -	    && atomic_read(&mm->mm_users) > 1) { -		u32 __user * tidptr = tsk->clear_child_tid; +	if (tsk->clear_child_tid) { +		if (!(tsk->flags & PF_SIGNALED) && +		    atomic_read(&mm->mm_users) > 1) { +			/* +			 * We don't check the error code - if userspace has +			 * not set up a proper pointer then tough luck. 
+			 */
+			put_user(0, tsk->clear_child_tid);
+			sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
+					1, NULL, NULL, 0);
+		}
 		tsk->clear_child_tid = NULL;
-
-		/*
-		 * We don't check the error code - if userspace has
-		 * not set up a proper pointer then tough luck.
-		 */
-		put_user(0, tidptr);
-		sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
 	}
 }
@@ -815,11 +815,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 {
 	struct signal_struct *sig;
-	if (clone_flags & CLONE_THREAD) {
-		atomic_inc(&current->signal->count);
-		atomic_inc(&current->signal->live);
+	if (clone_flags & CLONE_THREAD)
 		return 0;
-	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
 	tsk->signal = sig;
@@ -877,16 +874,6 @@ void __cleanup_signal(struct signal_struct *sig)
 	kmem_cache_free(signal_cachep, sig);
 }
-static void cleanup_signal(struct task_struct *tsk)
-{
-	struct signal_struct *sig = tsk->signal;
-
-	atomic_dec(&sig->live);
-
-	if (atomic_dec_and_test(&sig->count))
-		__cleanup_signal(sig);
-}
-
 static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
@@ -1239,6 +1226,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	}
 	if (clone_flags & CLONE_THREAD) {
+		atomic_inc(&current->signal->count);
+		atomic_inc(&current->signal->live);
 		p->group_leader = current->group_leader;
 		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
 	}
@@ -1268,6 +1257,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
+	perf_counter_fork(p);
 	return p;
 bad_fork_free_pid:
@@ -1281,7 +1271,8 @@ bad_fork_cleanup_mm:
 	if (p->mm)
 		mmput(p->mm);
 bad_fork_cleanup_signal:
-	cleanup_signal(p);
+	if (!(clone_flags & CLONE_THREAD))
+		__cleanup_signal(p->signal);
 bad_fork_cleanup_sighand:
 	__cleanup_sighand(p->sighand);
 bad_fork_cleanup_fs:
@@ -1409,9 +1400,6 @@ long do_fork(unsigned long clone_flags,
 			init_completion(&vfork);
 		}
-		if (!(clone_flags & CLONE_THREAD))
-			perf_counter_fork(p);
-
 		audit_finish_fork(p);
 		tracehook_report_clone(regs, clone_flags, nr, p);
diff --git a/kernel/futex.c b/kernel/futex.c
index 0672ff88f159..e18cfbdc7190 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1010,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
  * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
  * q:	the futex_q
  * key:	the key of the requeue target futex
+ * hb:  the hash_bucket of the requeue target futex
  *
  * During futex_requeue, with requeue_pi=1, it is possible to acquire the
  * target futex if it is uncontended or via a lock steal.  Set the futex_q key
  * to the requeue target futex so the waiter can detect the wakeup on the right
  * futex, but remove it from the hb and NULL the rt_waiter so it can detect
- * atomic lock acquisition.  Must be called with the q->lock_ptr held.
+ * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
+ * to protect access to the pi_state to fixup the owner later.  Must be called
+ * with both q->lock_ptr and hb->lock held.
*/  static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, +			   struct futex_hash_bucket *hb)  {  	drop_futex_key_refs(&q->key);  	get_futex_key_refs(key); @@ -1030,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)  	WARN_ON(!q->rt_waiter);  	q->rt_waiter = NULL; +	q->lock_ptr = &hb->lock; +#ifdef CONFIG_DEBUG_PI_LIST +	q->list.plist.lock = &hb->lock; +#endif +  	wake_up_state(q->task, TASK_NORMAL);  } @@ -1088,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,  	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,  				   set_waiters);  	if (ret == 1) -		requeue_pi_wake_futex(top_waiter, key2); +		requeue_pi_wake_futex(top_waiter, key2, hb2);  	return ret;  } @@ -1247,8 +1256,15 @@ retry_private:  		if (!match_futex(&this->key, &key1))  			continue; -		WARN_ON(!requeue_pi && this->rt_waiter); -		WARN_ON(requeue_pi && !this->rt_waiter); +		/* +		 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always +		 * be paired with each other and no other futex ops. +		 */ +		if ((requeue_pi && !this->rt_waiter) || +		    (!requeue_pi && this->rt_waiter)) { +			ret = -EINVAL; +			break; +		}  		/*  		 * Wake nr_wake waiters.  For requeue_pi, if we acquired the @@ -1273,7 +1289,7 @@ retry_private:  							this->task, 1);  			if (ret == 1) {  				/* We got the lock. */ -				requeue_pi_wake_futex(this, &key2); +				requeue_pi_wake_futex(this, &key2, hb2);  				continue;  			} else if (ret) {  				/* -EDEADLK */ diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d607a5b9ee29..235716556bf1 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,  	int cmd = op & FUTEX_CMD_MASK;  	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || -		      cmd == FUTEX_WAIT_BITSET)) { +		      cmd == FUTEX_WAIT_BITSET || +		      cmd == FUTEX_WAIT_REQUEUE_PI)) {  		if (get_compat_timespec(&ts, utime))  			return -EFAULT;  		if (!timespec_valid(&ts)) @@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,  			t = ktime_add_safe(ktime_get(), t);  		tp = &t;  	} -	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) +	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || +	    cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)  		val2 = (int) (unsigned long) utime;  	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 61c679db4687..0ec9ed831737 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -607,7 +607,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  		 */  		get_task_struct(t);  		new->thread = t; -		wake_up_process(t);  	}  	/* @@ -690,6 +689,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  				(int)(new->flags & IRQF_TRIGGER_MASK));  	} +	new->irq = irq;  	*old_ptr = new;  	/* Reset broken irq detection when installing new handler */ @@ -707,7 +707,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  	spin_unlock_irqrestore(&desc->lock, flags); -	new->irq = irq; +	/* +	 * Strictly no need to wake it up, but hung_task complains +	 * when no hard interrupt wakes the thread up. 
+	 */ +	if (new->thread) +		wake_up_process(new->thread); +  	register_irq_proc(irq, desc);  	new->dir = NULL;  	register_handler_proc(irq, new); @@ -761,7 +767,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)  {  	struct irq_desc *desc = irq_to_desc(irq);  	struct irqaction *action, **action_ptr; -	struct task_struct *irqthread;  	unsigned long flags;  	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -809,9 +814,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)  			desc->chip->disable(irq);  	} -	irqthread = action->thread; -	action->thread = NULL; -  	spin_unlock_irqrestore(&desc->lock, flags);  	unregister_handler_proc(irq, action); @@ -819,12 +821,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)  	/* Make sure it's not being used on another CPU: */  	synchronize_irq(irq); -	if (irqthread) { -		if (!test_bit(IRQTF_DIED, &action->thread_flags)) -			kthread_stop(irqthread); -		put_task_struct(irqthread); -	} -  #ifdef CONFIG_DEBUG_SHIRQ  	/*  	 * It's a shared IRQ -- the driver ought to be prepared for an IRQ @@ -840,6 +836,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)  		local_irq_restore(flags);  	}  #endif + +	if (action->thread) { +		if (!test_bit(IRQTF_DIED, &action->thread_flags)) +			kthread_stop(action->thread); +		put_task_struct(action->thread); +	} +  	return action;  } diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 2f69bee57bf2..3fd30197da2e 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -107,8 +107,8 @@ out_unlock:  struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)  { -	/* those all static, do move them */ -	if (desc->irq < NR_IRQS_LEGACY) +	/* those static or target node is -1, do not move them */ +	if (desc->irq < NR_IRQS_LEGACY || node == -1)  		return desc;  	if (desc->node != node) diff --git a/kernel/kexec.c b/kernel/kexec.c index ae1c35201cc8..f336e2107f98 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char 			*cmdline,  	} while (*cur++ == ',');  	if (*crash_size > 0) { -		while (*cur != ' ' && *cur != '@') +		while (*cur && *cur != ' ' && *cur != '@')  			cur++;  		if (*cur == '@') {  			cur++; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 16b5739c516a..0540948e29ab 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -694,7 +694,7 @@ int __kprobes register_kprobe(struct kprobe *p)  	p->addr = addr;  	preempt_disable(); -	if (!__kernel_text_address((unsigned long) p->addr) || +	if (!kernel_text_address((unsigned long) p->addr) ||  	    in_kprobes_functions((unsigned long) p->addr)) {  		preempt_enable();  		return -EINVAL; diff --git a/kernel/kthread.c b/kernel/kthread.c index 9b1a7de26979..eb8751aa0418 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,10 +180,12 @@ EXPORT_SYMBOL(kthread_bind);   * @k: thread created by kthread_create().   *   * Sets kthread_should_stop() for @k to return true, wakes it, and - * waits for it to exit.  Your threadfn() must not call do_exit() - * itself if you use this function!  This can also be called after - * kthread_create() instead of calling wake_up_process(): the thread - * will exit without calling threadfn(). + * waits for it to exit. This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will exit without + * calling threadfn(). 
+ * + * If threadfn() may call do_exit() itself, the caller must ensure + * task_struct can't go away.   *   * Returns the result of threadfn(), or %-EINTR if wake_up_process()   * was never called. diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index ec33c6ad58dd..d4b3dbc79fdb 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -680,7 +680,8 @@ static int __init lockdep_proc_init(void)  		    &proc_lockdep_stats_operations);  #ifdef CONFIG_LOCK_STAT -	proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); +	proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, +		    &proc_lock_stat_operations);  #endif  	return 0; diff --git a/kernel/module.c b/kernel/module.c index 0a049837008e..eccb561dd8a3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -909,16 +909,18 @@ void __symbol_put(const char *symbol)  }  EXPORT_SYMBOL(__symbol_put); +/* Note this assumes addr is a function, which it currently always is. */  void symbol_put_addr(void *addr)  {  	struct module *modaddr; +	unsigned long a = (unsigned long)dereference_function_descriptor(addr); -	if (core_kernel_text((unsigned long)addr)) +	if (core_kernel_text(a))  		return;  	/* module_text_address is safe here: we're supposed to have reference  	 * to module from symbol_get, so it can't go away. */ -	modaddr = __module_text_address((unsigned long)addr); +	modaddr = __module_text_address(a);  	BUG_ON(!modaddr);  	module_put(modaddr);  } @@ -1068,7 +1070,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,  {  	const unsigned long *crc; -	if (!find_symbol("module_layout", NULL, &crc, true, false)) +	if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, +			 &crc, true, false))  		BUG();  	return check_version(sechdrs, versindex, "module_layout", mod, crc);  } @@ -2352,7 +2355,8 @@ static noinline struct module *load_module(void __user *umod,  	if (err < 0)  		goto unlink;  	add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); -	add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); +	if (mod->sect_attrs) +		add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);  	/* Get rid of temporary copy */  	vfree(hdr); diff --git a/kernel/panic.c b/kernel/panic.c index 984b3ecbd72c..512ab73b0ca3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -301,6 +301,7 @@ int oops_may_print(void)   */  void oops_enter(void)  { +	tracing_off();  	/* can't trust the integrity of the kernel anymore: */  	debug_locks_off();  	do_oops_enter_exit(); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 950931041954..f274e1959885 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1;  static atomic_t nr_counters __read_mostly;  static atomic_t nr_mmap_counters __read_mostly;  static atomic_t nr_comm_counters __read_mostly; +static atomic_t nr_task_counters __read_mostly;  /*   * perf counter paranoia level: @@ -87,6 +88,7 @@ void __weak hw_perf_disable(void)		{ barrier(); }  void __weak hw_perf_enable(void)		{ barrier(); }  void __weak hw_perf_counter_setup(int cpu)	{ barrier(); } +void __weak hw_perf_counter_setup_online(int cpu)	{ barrier(); }  int __weak  hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -305,6 +307,10 @@ counter_sched_out(struct perf_counter *counter,  		return;  	counter->state = PERF_COUNTER_STATE_INACTIVE; +	if (counter->pending_disable) { +		counter->pending_disable = 0; +		counter->state = PERF_COUNTER_STATE_OFF; +	}  	counter->tstamp_stopped = ctx->time;  	
counter->pmu->disable(counter);  	counter->oncpu = -1; @@ -1103,7 +1109,7 @@ static void perf_counter_sync_stat(struct perf_counter_context *ctx,  		__perf_counter_sync_stat(counter, next_counter);  		counter = list_next_entry(counter, event_entry); -		next_counter = list_next_entry(counter, event_entry); +		next_counter = list_next_entry(next_counter, event_entry);  	}  } @@ -1497,10 +1503,21 @@ static void perf_counter_enable_on_exec(struct task_struct *task)   */  static void __perf_counter_read(void *info)  { +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);  	struct perf_counter *counter = info;  	struct perf_counter_context *ctx = counter->ctx;  	unsigned long flags; +	/* +	 * If this is a task context, we need to check whether it is +	 * the current task context of this cpu.  If not it has been +	 * scheduled out before the smp call arrived.  In that case +	 * counter->count would have been updated to a recent sample +	 * when the counter was scheduled out. +	 */ +	if (ctx->task && cpuctx->task_ctx != ctx) +		return; +  	local_irq_save(flags);  	if (ctx->is_active)  		update_context_time(ctx); @@ -1654,6 +1671,8 @@ static void free_counter(struct perf_counter *counter)  			atomic_dec(&nr_mmap_counters);  		if (counter->attr.comm)  			atomic_dec(&nr_comm_counters); +		if (counter->attr.task) +			atomic_dec(&nr_task_counters);  	}  	if (counter->destroy) @@ -1688,14 +1707,133 @@ static int perf_release(struct inode *inode, struct file *file)  	return 0;  } +static int perf_counter_read_size(struct perf_counter *counter) +{ +	int entry = sizeof(u64); /* value */ +	int size = 0; +	int nr = 1; + +	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) +		size += sizeof(u64); + +	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) +		size += sizeof(u64); + +	if (counter->attr.read_format & PERF_FORMAT_ID) +		entry += sizeof(u64); + +	if (counter->attr.read_format & PERF_FORMAT_GROUP) { +		nr += counter->group_leader->nr_siblings; +		size += sizeof(u64); +	} + +	size += entry * nr; + +	return size; +} + +static u64 perf_counter_read_value(struct perf_counter *counter) +{ +	struct perf_counter *child; +	u64 total = 0; + +	total += perf_counter_read(counter); +	list_for_each_entry(child, &counter->child_list, child_list) +		total += perf_counter_read(child); + +	return total; +} + +static int perf_counter_read_entry(struct perf_counter *counter, +				   u64 read_format, char __user *buf) +{ +	int n = 0, count = 0; +	u64 values[2]; + +	values[n++] = perf_counter_read_value(counter); +	if (read_format & PERF_FORMAT_ID) +		values[n++] = primary_counter_id(counter); + +	count = n * sizeof(u64); + +	if (copy_to_user(buf, values, count)) +		return -EFAULT; + +	return count; +} + +static int perf_counter_read_group(struct perf_counter *counter, +				   u64 read_format, char __user *buf) +{ +	struct perf_counter *leader = counter->group_leader, *sub; +	int n = 0, size = 0, err = -EFAULT; +	u64 values[3]; + +	values[n++] = 1 + leader->nr_siblings; +	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { +		values[n++] = leader->total_time_enabled + +			atomic64_read(&leader->child_total_time_enabled); +	} +	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { +		values[n++] = leader->total_time_running + +			atomic64_read(&leader->child_total_time_running); +	} + +	size = n * sizeof(u64); + +	if (copy_to_user(buf, values, size)) +		return -EFAULT; + +	err = perf_counter_read_entry(leader, read_format, buf + size); +	if (err < 0) +		return err; + +	size += err; + +	
list_for_each_entry(sub, &leader->sibling_list, list_entry) { +		err = perf_counter_read_entry(sub, read_format, +				buf + size); +		if (err < 0) +			return err; + +		size += err; +	} + +	return size; +} + +static int perf_counter_read_one(struct perf_counter *counter, +				 u64 read_format, char __user *buf) +{ +	u64 values[4]; +	int n = 0; + +	values[n++] = perf_counter_read_value(counter); +	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { +		values[n++] = counter->total_time_enabled + +			atomic64_read(&counter->child_total_time_enabled); +	} +	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { +		values[n++] = counter->total_time_running + +			atomic64_read(&counter->child_total_time_running); +	} +	if (read_format & PERF_FORMAT_ID) +		values[n++] = primary_counter_id(counter); + +	if (copy_to_user(buf, values, n * sizeof(u64))) +		return -EFAULT; + +	return n * sizeof(u64); +} +  /*   * Read the performance counter - simple non blocking version for now   */  static ssize_t  perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)  { -	u64 values[4]; -	int n; +	u64 read_format = counter->attr.read_format; +	int ret;  	/*  	 * Return end-of-file for a read on a counter that is in @@ -1705,28 +1843,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)  	if (counter->state == PERF_COUNTER_STATE_ERROR)  		return 0; +	if (count < perf_counter_read_size(counter)) +		return -ENOSPC; +  	WARN_ON_ONCE(counter->ctx->parent_ctx);  	mutex_lock(&counter->child_mutex); -	values[0] = perf_counter_read(counter); -	n = 1; -	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) -		values[n++] = counter->total_time_enabled + -			atomic64_read(&counter->child_total_time_enabled); -	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) -		values[n++] = counter->total_time_running + -			atomic64_read(&counter->child_total_time_running); -	if (counter->attr.read_format & PERF_FORMAT_ID) -		values[n++] = primary_counter_id(counter); +	if (read_format & PERF_FORMAT_GROUP) +		ret = perf_counter_read_group(counter, read_format, buf); +	else +		ret = perf_counter_read_one(counter, read_format, buf);  	mutex_unlock(&counter->child_mutex); -	if (count < n * sizeof(u64)) -		return -EINVAL; -	count = n * sizeof(u64); - -	if (copy_to_user(buf, values, count)) -		return -EFAULT; - -	return count; +	return ret;  }  static ssize_t @@ -1891,6 +2019,10 @@ int perf_counter_task_disable(void)  	return 0;  } +#ifndef PERF_COUNTER_INDEX_OFFSET +# define PERF_COUNTER_INDEX_OFFSET 0 +#endif +  static int perf_counter_index(struct perf_counter *counter)  {  	if (counter->state != PERF_COUNTER_STATE_ACTIVE) @@ -2230,7 +2362,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry)  	if (counter->pending_disable) {  		counter->pending_disable = 0; -		perf_counter_disable(counter); +		__perf_counter_disable(counter);  	}  	if (counter->pending_wakeup) { @@ -2615,7 +2747,80 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)  	return task_pid_nr_ns(p, counter->ns);  } -static void perf_counter_output(struct perf_counter *counter, int nmi, +static void perf_output_read_one(struct perf_output_handle *handle, +				 struct perf_counter *counter) +{ +	u64 read_format = counter->attr.read_format; +	u64 values[4]; +	int n = 0; + +	values[n++] = atomic64_read(&counter->count); +	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { +		values[n++] = counter->total_time_enabled + +			
atomic64_read(&counter->child_total_time_enabled); +	} +	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { +		values[n++] = counter->total_time_running + +			atomic64_read(&counter->child_total_time_running); +	} +	if (read_format & PERF_FORMAT_ID) +		values[n++] = primary_counter_id(counter); + +	perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, +			    struct perf_counter *counter) +{ +	struct perf_counter *leader = counter->group_leader, *sub; +	u64 read_format = counter->attr.read_format; +	u64 values[5]; +	int n = 0; + +	values[n++] = 1 + leader->nr_siblings; + +	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) +		values[n++] = leader->total_time_enabled; + +	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) +		values[n++] = leader->total_time_running; + +	if (leader != counter) +		leader->pmu->read(leader); + +	values[n++] = atomic64_read(&leader->count); +	if (read_format & PERF_FORMAT_ID) +		values[n++] = primary_counter_id(leader); + +	perf_output_copy(handle, values, n * sizeof(u64)); + +	list_for_each_entry(sub, &leader->sibling_list, list_entry) { +		n = 0; + +		if (sub != counter) +			sub->pmu->read(sub); + +		values[n++] = atomic64_read(&sub->count); +		if (read_format & PERF_FORMAT_ID) +			values[n++] = primary_counter_id(sub); + +		perf_output_copy(handle, values, n * sizeof(u64)); +	} +} + +static void perf_output_read(struct perf_output_handle *handle, +			     struct perf_counter *counter) +{ +	if (counter->attr.read_format & PERF_FORMAT_GROUP) +		perf_output_read_group(handle, counter); +	else +		perf_output_read_one(handle, counter); +} + +void perf_counter_output(struct perf_counter *counter, int nmi,  				struct perf_sample_data *data)  {  	int ret; @@ -2626,10 +2831,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,  	struct {  		u32 pid, tid;  	} tid_entry; -	struct { -		u64 id; -		u64 counter; -	} group_entry;  	struct perf_callchain_entry *callchain = NULL;  	int callchain_size = 0;  	u64 time; @@ -2684,10 +2885,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,  	if (sample_type & PERF_SAMPLE_PERIOD)  		header.size += sizeof(u64); -	if (sample_type & PERF_SAMPLE_GROUP) { -		header.size += sizeof(u64) + -			counter->nr_siblings * sizeof(group_entry); -	} +	if (sample_type & PERF_SAMPLE_READ) +		header.size += perf_counter_read_size(counter);  	if (sample_type & PERF_SAMPLE_CALLCHAIN) {  		callchain = perf_callchain(data->regs); @@ -2699,6 +2898,18 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,  			header.size += sizeof(u64);  	} +	if (sample_type & PERF_SAMPLE_RAW) { +		int size = sizeof(u32); + +		if (data->raw) +			size += data->raw->size; +		else +			size += sizeof(u32); + +		WARN_ON_ONCE(size & (sizeof(u64)-1)); +		header.size += size; +	} +  	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);  	if (ret)  		return; @@ -2732,26 +2943,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,  	if (sample_type & PERF_SAMPLE_PERIOD)  		perf_output_put(&handle, data->period); -	/* -	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 
-	 */ -	if (sample_type & PERF_SAMPLE_GROUP) { -		struct perf_counter *leader, *sub; -		u64 nr = counter->nr_siblings; - -		perf_output_put(&handle, nr); - -		leader = counter->group_leader; -		list_for_each_entry(sub, &leader->sibling_list, list_entry) { -			if (sub != counter) -				sub->pmu->read(sub); - -			group_entry.id = primary_counter_id(sub); -			group_entry.counter = atomic64_read(&sub->count); - -			perf_output_put(&handle, group_entry); -		} -	} +	if (sample_type & PERF_SAMPLE_READ) +		perf_output_read(&handle, counter);  	if (sample_type & PERF_SAMPLE_CALLCHAIN) {  		if (callchain) @@ -2762,6 +2955,22 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,  		}  	} +	if (sample_type & PERF_SAMPLE_RAW) { +		if (data->raw) { +			perf_output_put(&handle, data->raw->size); +			perf_output_copy(&handle, data->raw->data, data->raw->size); +		} else { +			struct { +				u32	size; +				u32	data; +			} raw = { +				.size = sizeof(u32), +				.data = 0, +			}; +			perf_output_put(&handle, raw); +		} +	} +  	perf_output_end(&handle);  } @@ -2774,8 +2983,6 @@ struct perf_read_event {  	u32				pid;  	u32				tid; -	u64				value; -	u64				format[3];  };  static void @@ -2787,80 +2994,74 @@ perf_counter_read_event(struct perf_counter *counter,  		.header = {  			.type = PERF_EVENT_READ,  			.misc = 0, -			.size = sizeof(event) - sizeof(event.format), +			.size = sizeof(event) + perf_counter_read_size(counter),  		},  		.pid = perf_counter_pid(counter, task),  		.tid = perf_counter_tid(counter, task), -		.value = atomic64_read(&counter->count),  	}; -	int ret, i = 0; - -	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { -		event.header.size += sizeof(u64); -		event.format[i++] = counter->total_time_enabled; -	} - -	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { -		event.header.size += sizeof(u64); -		event.format[i++] = counter->total_time_running; -	} - -	if (counter->attr.read_format & PERF_FORMAT_ID) { -		event.header.size += sizeof(u64); -		event.format[i++] = primary_counter_id(counter); -	} +	int ret;  	ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);  	if (ret)  		return; -	perf_output_copy(&handle, &event, event.header.size); +	perf_output_put(&handle, event); +	perf_output_read(&handle, counter); +  	perf_output_end(&handle);  }  /* - * fork tracking + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task   */ -struct perf_fork_event { -	struct task_struct	*task; +struct perf_task_event { +	struct task_struct		*task; +	struct perf_counter_context	*task_ctx;  	struct {  		struct perf_event_header	header;  		u32				pid;  		u32				ppid; +		u32				tid; +		u32				ptid;  	} event;  }; -static void perf_counter_fork_output(struct perf_counter *counter, -				     struct perf_fork_event *fork_event) +static void perf_counter_task_output(struct perf_counter *counter, +				     struct perf_task_event *task_event)  {  	struct perf_output_handle handle; -	int size = fork_event->event.header.size; -	struct task_struct *task = fork_event->task; +	int size = task_event->event.header.size; +	struct task_struct *task = task_event->task;  	int ret = perf_output_begin(&handle, counter, size, 0, 0);  	if (ret)  		return; -	fork_event->event.pid = perf_counter_pid(counter, task); -	fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); +	task_event->event.pid = perf_counter_pid(counter, task); +	task_event->event.ppid = perf_counter_pid(counter, current); -	perf_output_put(&handle, 
fork_event->event); +	task_event->event.tid = perf_counter_tid(counter, task); +	task_event->event.ptid = perf_counter_tid(counter, current); + +	perf_output_put(&handle, task_event->event);  	perf_output_end(&handle);  } -static int perf_counter_fork_match(struct perf_counter *counter) +static int perf_counter_task_match(struct perf_counter *counter)  { -	if (counter->attr.comm || counter->attr.mmap) +	if (counter->attr.comm || counter->attr.mmap || counter->attr.task)  		return 1;  	return 0;  } -static void perf_counter_fork_ctx(struct perf_counter_context *ctx, -				  struct perf_fork_event *fork_event) +static void perf_counter_task_ctx(struct perf_counter_context *ctx, +				  struct perf_task_event *task_event)  {  	struct perf_counter *counter; @@ -2869,54 +3070,62 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,  	rcu_read_lock();  	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { -		if (perf_counter_fork_match(counter)) -			perf_counter_fork_output(counter, fork_event); +		if (perf_counter_task_match(counter)) +			perf_counter_task_output(counter, task_event);  	}  	rcu_read_unlock();  } -static void perf_counter_fork_event(struct perf_fork_event *fork_event) +static void perf_counter_task_event(struct perf_task_event *task_event)  {  	struct perf_cpu_context *cpuctx; -	struct perf_counter_context *ctx; +	struct perf_counter_context *ctx = task_event->task_ctx;  	cpuctx = &get_cpu_var(perf_cpu_context); -	perf_counter_fork_ctx(&cpuctx->ctx, fork_event); +	perf_counter_task_ctx(&cpuctx->ctx, task_event);  	put_cpu_var(perf_cpu_context);  	rcu_read_lock(); -	/* -	 * doesn't really matter which of the child contexts the -	 * events ends up in. -	 */ -	ctx = rcu_dereference(current->perf_counter_ctxp); +	if (!ctx) +		ctx = rcu_dereference(task_event->task->perf_counter_ctxp);  	if (ctx) -		perf_counter_fork_ctx(ctx, fork_event); +		perf_counter_task_ctx(ctx, task_event);  	rcu_read_unlock();  } -void perf_counter_fork(struct task_struct *task) +static void perf_counter_task(struct task_struct *task, +			      struct perf_counter_context *task_ctx, +			      int new)  { -	struct perf_fork_event fork_event; +	struct perf_task_event task_event;  	if (!atomic_read(&nr_comm_counters) && -	    !atomic_read(&nr_mmap_counters)) +	    !atomic_read(&nr_mmap_counters) && +	    !atomic_read(&nr_task_counters))  		return; -	fork_event = (struct perf_fork_event){ -		.task	= task, -		.event  = { +	task_event = (struct perf_task_event){ +		.task	  = task, +		.task_ctx = task_ctx, +		.event    = {  			.header = { -				.type = PERF_EVENT_FORK, +				.type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,  				.misc = 0, -				.size = sizeof(fork_event.event), +				.size = sizeof(task_event.event),  			},  			/* .pid  */  			/* .ppid */ +			/* .tid  */ +			/* .ptid */  		},  	}; -	perf_counter_fork_event(&fork_event); +	perf_counter_task_event(&task_event); +} + +void perf_counter_fork(struct task_struct *task) +{ +	perf_counter_task(task, NULL, 1);  }  /* @@ -3305,125 +3514,111 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,   * Generic software counter infrastructure   */ -static void perf_swcounter_update(struct perf_counter *counter) +/* + * We directly increment counter->count and keep a second value in + * counter->hw.period_left to count intervals. This period counter + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. 
+ */ + +static u64 perf_swcounter_set_period(struct perf_counter *counter)  {  	struct hw_perf_counter *hwc = &counter->hw; -	u64 prev, now; -	s64 delta; +	u64 period = hwc->last_period; +	u64 nr, offset; +	s64 old, val; + +	hwc->last_period = hwc->sample_period;  again: -	prev = atomic64_read(&hwc->prev_count); -	now = atomic64_read(&hwc->count); -	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) -		goto again; +	old = val = atomic64_read(&hwc->period_left); +	if (val < 0) +		return 0; -	delta = now - prev; +	nr = div64_u64(period + val, period); +	offset = nr * period; +	val -= offset; +	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) +		goto again; -	atomic64_add(delta, &counter->count); -	atomic64_sub(delta, &hwc->period_left); +	return nr;  } -static void perf_swcounter_set_period(struct perf_counter *counter) +static void perf_swcounter_overflow(struct perf_counter *counter, +				    int nmi, struct perf_sample_data *data)  {  	struct hw_perf_counter *hwc = &counter->hw; -	s64 left = atomic64_read(&hwc->period_left); -	s64 period = hwc->sample_period; +	u64 overflow; -	if (unlikely(left <= -period)) { -		left = period; -		atomic64_set(&hwc->period_left, left); -		hwc->last_period = period; -	} +	data->period = counter->hw.last_period; +	overflow = perf_swcounter_set_period(counter); -	if (unlikely(left <= 0)) { -		left += period; -		atomic64_add(period, &hwc->period_left); -		hwc->last_period = period; -	} +	if (hwc->interrupts == MAX_INTERRUPTS) +		return; -	atomic64_set(&hwc->prev_count, -left); -	atomic64_set(&hwc->count, -left); +	for (; overflow; overflow--) { +		if (perf_counter_overflow(counter, nmi, data)) { +			/* +			 * We inhibit the overflow from happening when +			 * hwc->interrupts == MAX_INTERRUPTS. +			 */ +			break; +		} +	}  } -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +static void perf_swcounter_unthrottle(struct perf_counter *counter)  { -	enum hrtimer_restart ret = HRTIMER_RESTART; -	struct perf_sample_data data; -	struct perf_counter *counter; -	u64 period; - -	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer); -	counter->pmu->read(counter); - -	data.addr = 0; -	data.regs = get_irq_regs();  	/* -	 * In case we exclude kernel IPs or are somehow not in interrupt -	 * context, provide the next best thing, the user IP. +	 * Nothing to do, we already reset hwc->interrupts.  	 
*/ -	if ((counter->attr.exclude_kernel || !data.regs) && -			!counter->attr.exclude_user) -		data.regs = task_pt_regs(current); +} -	if (data.regs) { -		if (perf_counter_overflow(counter, 0, &data)) -			ret = HRTIMER_NORESTART; -	} +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, +			       int nmi, struct perf_sample_data *data) +{ +	struct hw_perf_counter *hwc = &counter->hw; -	period = max_t(u64, 10000, counter->hw.sample_period); -	hrtimer_forward_now(hrtimer, ns_to_ktime(period)); +	atomic64_add(nr, &counter->count); -	return ret; -} +	if (!hwc->sample_period) +		return; -static void perf_swcounter_overflow(struct perf_counter *counter, -				    int nmi, struct perf_sample_data *data) -{ -	data->period = counter->hw.last_period; +	if (!data->regs) +		return; -	perf_swcounter_update(counter); -	perf_swcounter_set_period(counter); -	if (perf_counter_overflow(counter, nmi, data)) -		/* soft-disable the counter */ -		; +	if (!atomic64_add_negative(nr, &hwc->period_left)) +		perf_swcounter_overflow(counter, nmi, data);  }  static int perf_swcounter_is_counting(struct perf_counter *counter)  { -	struct perf_counter_context *ctx; -	unsigned long flags; -	int count; - +	/* +	 * The counter is active, we're good! +	 */  	if (counter->state == PERF_COUNTER_STATE_ACTIVE)  		return 1; +	/* +	 * The counter is off/error, not counting. +	 */  	if (counter->state != PERF_COUNTER_STATE_INACTIVE)  		return 0;  	/* -	 * If the counter is inactive, it could be just because -	 * its task is scheduled out, or because it's in a group -	 * which could not go on the PMU.  We want to count in -	 * the first case but not the second.  If the context is -	 * currently active then an inactive software counter must -	 * be the second case.  If it's not currently active then -	 * we need to know whether the counter was active when the -	 * context was last active, which we can determine by -	 * comparing counter->tstamp_stopped with ctx->time. -	 * -	 * We are within an RCU read-side critical section, -	 * which protects the existence of *ctx. +	 * The counter is inactive, if the context is active +	 * we're part of a group that didn't make it on the 'pmu', +	 * not counting.  	 */ -	ctx = counter->ctx; -	spin_lock_irqsave(&ctx->lock, flags); -	count = 1; -	/* Re-check state now we have the lock */ -	if (counter->state < PERF_COUNTER_STATE_INACTIVE || -	    counter->ctx->is_active || -	    counter->tstamp_stopped < ctx->time) -		count = 0; -	spin_unlock_irqrestore(&ctx->lock, flags); -	return count; +	if (counter->ctx->is_active) +		return 0; + +	/* +	 * We're inactive and the context is too, this means the +	 * task is scheduled out, we're counting events that happen +	 * to us, like migration events. 
+	 */ +	return 1;  }  static int perf_swcounter_match(struct perf_counter *counter, @@ -3449,15 +3644,6 @@ static int perf_swcounter_match(struct perf_counter *counter,  	return 1;  } -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, -			       int nmi, struct perf_sample_data *data) -{ -	int neg = atomic64_add_negative(nr, &counter->hw.count); - -	if (counter->hw.sample_period && !neg && data->regs) -		perf_swcounter_overflow(counter, nmi, data); -} -  static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,  				     enum perf_type_id type,  				     u32 event, u64 nr, int nmi, @@ -3536,27 +3722,66 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,  static void perf_swcounter_read(struct perf_counter *counter)  { -	perf_swcounter_update(counter);  }  static int perf_swcounter_enable(struct perf_counter *counter)  { -	perf_swcounter_set_period(counter); +	struct hw_perf_counter *hwc = &counter->hw; + +	if (hwc->sample_period) { +		hwc->last_period = hwc->sample_period; +		perf_swcounter_set_period(counter); +	}  	return 0;  }  static void perf_swcounter_disable(struct perf_counter *counter)  { -	perf_swcounter_update(counter);  }  static const struct pmu perf_ops_generic = {  	.enable		= perf_swcounter_enable,  	.disable	= perf_swcounter_disable,  	.read		= perf_swcounter_read, +	.unthrottle	= perf_swcounter_unthrottle,  };  /* + * hrtimer based swcounter callback + */ + +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ +	enum hrtimer_restart ret = HRTIMER_RESTART; +	struct perf_sample_data data; +	struct perf_counter *counter; +	u64 period; + +	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer); +	counter->pmu->read(counter); + +	data.addr = 0; +	data.regs = get_irq_regs(); +	/* +	 * In case we exclude kernel IPs or are somehow not in interrupt +	 * context, provide the next best thing, the user IP. +	 */ +	if ((counter->attr.exclude_kernel || !data.regs) && +			!counter->attr.exclude_user) +		data.regs = task_pt_regs(current); + +	if (data.regs) { +		if (perf_counter_overflow(counter, 0, &data)) +			ret = HRTIMER_NORESTART; +	} + +	period = max_t(u64, 10000, counter->hw.sample_period); +	hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + +	return ret; +} + +/*   * Software counter: cpu wall time clock   */ @@ -3673,17 +3898,24 @@ static const struct pmu perf_ops_task_clock = {  };  #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id) +void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, +			  int entry_size)  { +	struct perf_raw_record raw = { +		.size = entry_size, +		.data = record, +	}; +  	struct perf_sample_data data = {  		.regs = get_irq_regs(), -		.addr = 0, +		.addr = addr, +		.raw = &raw,  	};  	if (!data.regs)  		data.regs = task_pt_regs(current); -	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); +	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);  }  EXPORT_SYMBOL_GPL(perf_tpcounter_event); @@ -3697,6 +3929,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)  static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)  { +	/* +	 * Raw tracepoint data is a severe data leak, only allow root to +	 * have these. 
+	 */ +	if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && +			!capable(CAP_SYS_ADMIN)) +		return ERR_PTR(-EPERM); +  	if (ftrace_profile_enable(counter->attr.config))  		return NULL; @@ -3830,9 +4070,9 @@ perf_counter_alloc(struct perf_counter_attr *attr,  	atomic64_set(&hwc->period_left, hwc->sample_period);  	/* -	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters +	 * we currently do not support PERF_FORMAT_GROUP on inherited counters  	 */ -	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) +	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))  		goto done;  	switch (attr->type) { @@ -3875,6 +4115,8 @@ done:  			atomic_inc(&nr_mmap_counters);  		if (counter->attr.comm)  			atomic_inc(&nr_comm_counters); +		if (counter->attr.task) +			atomic_inc(&nr_task_counters);  	}  	return counter; @@ -4236,8 +4478,10 @@ void perf_counter_exit_task(struct task_struct *child)  	struct perf_counter_context *child_ctx;  	unsigned long flags; -	if (likely(!child->perf_counter_ctxp)) +	if (likely(!child->perf_counter_ctxp)) { +		perf_counter_task(child, NULL, 0);  		return; +	}  	local_irq_save(flags);  	/* @@ -4262,8 +4506,14 @@ void perf_counter_exit_task(struct task_struct *child)  	 * the counters from it.  	 */  	unclone_ctx(child_ctx); -	spin_unlock(&child_ctx->lock); -	local_irq_restore(flags); +	spin_unlock_irqrestore(&child_ctx->lock, flags); + +	/* +	 * Report the task dead after unscheduling the counters so that we +	 * won't get any samples after PERF_EVENT_EXIT. We can however still +	 * get a few PERF_EVENT_READ events. +	 */ +	perf_counter_task(child, child_ctx, 0);  	/*  	 * We can recurse on the same lock type through: @@ -4484,6 +4734,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)  		perf_counter_init_cpu(cpu);  		break; +	case CPU_ONLINE: +	case CPU_ONLINE_FROZEN: +		hw_perf_counter_setup_online(cpu); +		break; +  	case CPU_DOWN_PREPARE:  	case CPU_DOWN_PREPARE_FROZEN:  		perf_counter_exit_cpu(cpu); @@ -4508,6 +4763,8 @@ void __init perf_counter_init(void)  {  	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,  			(void *)(long)smp_processor_id()); +	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, +			(void *)(long)smp_processor_id());  	register_cpu_notifier(&perf_cpu_nb);  } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bece7c0b67b2..e33a21cb9407 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)  }  void posix_cpu_timers_exit_group(struct task_struct *tsk)  { -	struct task_cputime cputime; +	struct signal_struct *const sig = tsk->signal; -	thread_group_cputimer(tsk, &cputime);  	cleanup_timers(tsk->signal->cpu_timers, -		       cputime.utime, cputime.stime, cputime.sum_exec_runtime); +		       cputime_add(tsk->utime, sig->utime), +		       cputime_add(tsk->stime, sig->stime), +		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);  }  static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 052ec4d195c7..d089d052c4a9 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)  	return -EOPNOTSUPP;  } +static int no_nsleep(const clockid_t which_clock, int flags, +		     struct timespec *tsave, struct timespec __user *rmtp) +{ +	return -EOPNOTSUPP; +} +  /*   * Return nonzero if we know a priori 
this clockid_t value is bogus.   */ @@ -254,6 +260,7 @@ static __init int init_posix_timers(void)  		.clock_get = posix_get_monotonic_raw,  		.clock_set = do_posix_clock_nosettime,  		.timer_create = no_timer_create, +		.nsleep = no_nsleep,  	};  	register_posix_clock(CLOCK_REALTIME, &clock_realtime); diff --git a/kernel/profile.c b/kernel/profile.c index 69911b5745eb..419250ebec4d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -117,11 +117,12 @@ int __ref profile_init(void)  	cpumask_copy(prof_cpu_mask, cpu_possible_mask); -	prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); +	prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);  	if (prof_buffer)  		return 0; -	prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); +	prof_buffer = alloc_pages_exact(buffer_bytes, +					GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);  	if (prof_buffer)  		return 0; diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index fcd107a78c5a..29bd4baf9e75 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,  	if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {  		/* We got the lock for task. */  		debug_rt_mutex_lock(lock); -  		rt_mutex_set_owner(lock, task, 0); - +		spin_unlock(&lock->wait_lock);  		rt_mutex_deadlock_account_lock(lock, task);  		return 1;  	}  	ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); -  	if (ret && !waiter->task) {  		/*  		 * Reset the return value. We might have diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6c251790dde..d014efbf947a 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,  		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)  			continue; -		if (lowest_mask) +		if (lowest_mask) {  			cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + +			/* +			 * We have to ensure that we have at least one bit +			 * still set in the array, since the map could have +			 * been concurrently emptied between the first and +			 * second reads of vec->mask.  If we hit this +			 * condition, simply act as though we never hit this +			 * priority level and continue on. 
+			 */ +			if (cpumask_any(lowest_mask) >= nr_cpu_ids) +				continue; +		} +  		return 1;  	} diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9ffb2b2ceba4..652e8bdef9aa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -611,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  {  #ifdef CONFIG_SCHEDSTATS +	struct task_struct *tsk = NULL; + +	if (entity_is_task(se)) +		tsk = task_of(se); +  	if (se->sleep_start) {  		u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; -		struct task_struct *tsk = task_of(se);  		if ((s64)delta < 0)  			delta = 0; @@ -624,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  		se->sleep_start = 0;  		se->sum_sleep_runtime += delta; -		account_scheduler_latency(tsk, delta >> 10, 1); +		if (tsk) +			account_scheduler_latency(tsk, delta >> 10, 1);  	}  	if (se->block_start) {  		u64 delta = rq_of(cfs_rq)->clock - se->block_start; -		struct task_struct *tsk = task_of(se);  		if ((s64)delta < 0)  			delta = 0; @@ -639,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  		se->block_start = 0;  		se->sum_sleep_runtime += delta; -		/* -		 * Blocking time is in units of nanosecs, so shift by 20 to -		 * get a milliseconds-range estimation of the amount of -		 * time that the task spent sleeping: -		 */ -		if (unlikely(prof_on == SLEEP_PROFILING)) { - -			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), -				     delta >> 20); +		if (tsk) { +			/* +			 * Blocking time is in units of nanosecs, so shift by +			 * 20 to get a milliseconds-range estimation of the +			 * amount of time that the task spent sleeping: +			 */ +			if (unlikely(prof_on == SLEEP_PROFILING)) { +				profile_hits(SLEEP_PROFILING, +						(void *)get_wchan(tsk), +						delta >> 20); +			} +			account_scheduler_latency(tsk, delta >> 10, 0);  		} -		account_scheduler_latency(tsk, delta >> 10, 0);  	}  #endif  } diff --git a/kernel/signal.c b/kernel/signal.c index ccf1ceedaebe..64c5deeaca5d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2454,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s  	stack_t oss;  	int error; -	if (uoss) { -		oss.ss_sp = (void __user *) current->sas_ss_sp; -		oss.ss_size = current->sas_ss_size; -		oss.ss_flags = sas_ss_flags(sp); -	} +	oss.ss_sp = (void __user *) current->sas_ss_sp; +	oss.ss_size = current->sas_ss_size; +	oss.ss_flags = sas_ss_flags(sp);  	if (uss) {  		void __user *ss_sp; @@ -2466,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s  		int ss_flags;  		error = -EFAULT; -		if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) -		    || __get_user(ss_sp, &uss->ss_sp) -		    || __get_user(ss_flags, &uss->ss_flags) -		    || __get_user(ss_size, &uss->ss_size)) +		if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) +			goto out; +		error = __get_user(ss_sp, &uss->ss_sp) | +			__get_user(ss_flags, &uss->ss_flags) | +			__get_user(ss_size, &uss->ss_size); +		if (error)  			goto out;  		error = -EPERM; @@ -2501,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s  		current->sas_ss_size = ss_size;  	} +	error = 0;  	if (uoss) {  		error = -EFAULT; -		if (copy_to_user(uoss, &oss, sizeof(oss))) +		if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))  			goto out; +		error = __put_user(oss.ss_sp, &uoss->ss_sp) | +			
__put_user(oss.ss_size, &uoss->ss_size) | +			__put_user(oss.ss_flags, &uoss->ss_flags);  	} -	error = 0;  out:  	return error;  } diff --git a/kernel/smp.c b/kernel/smp.c index ad63d8501207..94188b8ecc33 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)  			return NOTIFY_BAD;  		break; -#ifdef CONFIG_CPU_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU  	case CPU_UP_CANCELED:  	case CPU_UP_CANCELED_FROZEN: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 98e02328c67d..58be76017fd0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@  #include <linux/acpi.h>  #include <linux/reboot.h>  #include <linux/ftrace.h> +#include <linux/security.h>  #include <linux/slow-work.h>  #include <linux/perf_counter.h> @@ -1306,10 +1307,10 @@ static struct ctl_table vm_table[] = {  	{  		.ctl_name	= CTL_UNNUMBERED,  		.procname	= "mmap_min_addr", -		.data		= &mmap_min_addr, -		.maxlen         = sizeof(unsigned long), +		.data		= &dac_mmap_min_addr, +		.maxlen		= sizeof(unsigned long),  		.mode		= 0644, -		.proc_handler	= &proc_doulongvec_minmax, +		.proc_handler	= &mmap_min_addr_handler,  	},  #ifdef CONFIG_NUMA  	{ diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a6dcd67b041d..620b58abdc32 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -137,11 +137,12 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,   */  int clockevents_register_notifier(struct notifier_block *nb)  { +	unsigned long flags;  	int ret; -	spin_lock(&clockevents_lock); +	spin_lock_irqsave(&clockevents_lock, flags);  	ret = raw_notifier_chain_register(&clockevents_chain, nb); -	spin_unlock(&clockevents_lock); +	spin_unlock_irqrestore(&clockevents_lock, flags);  	return ret;  } @@ -178,16 +179,18 @@ static void clockevents_notify_released(void)   */  void clockevents_register_device(struct clock_event_device *dev)  { +	unsigned long flags; +  	BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);  	BUG_ON(!dev->cpumask); -	spin_lock(&clockevents_lock); +	spin_lock_irqsave(&clockevents_lock, flags);  	list_add(&dev->list, &clockevent_devices);  	clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);  	clockevents_notify_released(); -	spin_unlock(&clockevents_lock); +	spin_unlock_irqrestore(&clockevents_lock, flags);  }  EXPORT_SYMBOL_GPL(clockevents_register_device); @@ -235,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old,  void clockevents_notify(unsigned long reason, void *arg)  {  	struct list_head *node, *tmp; +	unsigned long flags; -	spin_lock(&clockevents_lock); +	spin_lock_irqsave(&clockevents_lock, flags);  	clockevents_do_notify(reason, arg);  	switch (reason) { @@ -251,7 +255,7 @@ void clockevents_notify(unsigned long reason, void *arg)  	default:  		break;  	} -	spin_unlock(&clockevents_lock); +	spin_unlock_irqrestore(&clockevents_lock, flags);  }  EXPORT_SYMBOL_GPL(clockevents_notify);  #endif diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 877dbedc3118..c2ec25087a35 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -205,11 +205,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)   * Powerstate information: The system enters/leaves a state, where   * affected devices might stop   */ -static void tick_do_broadcast_on_off(void *why) +static void tick_do_broadcast_on_off(unsigned long *reason)  {  	struct clock_event_device *bc, *dev;  	struct tick_device *td; -	
unsigned long flags, *reason = why; +	unsigned long flags;  	int cpu, bc_stopped;  	spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -276,8 +276,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu)  		printk(KERN_ERR "tick-broadcast: ignoring broadcast for "  		       "offline CPU #%d\n", *oncpu);  	else -		smp_call_function_single(*oncpu, tick_do_broadcast_on_off, -					 &reason, 1); +		tick_do_broadcast_on_off(&reason);  }  /* diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a999b92a1277..fddd69d16e03 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -286,7 +286,7 @@ static int __init init_timer_list_procfs(void)  {  	struct proc_dir_entry *pe; -	pe = proc_create("timer_list", 0644, NULL, &timer_list_fops); +	pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);  	if (!pe)  		return -ENOMEM;  	return 0; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1090b0aed9ba..7a34cb563fec 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -267,8 +267,8 @@ static void blk_trace_free(struct blk_trace *bt)  {  	debugfs_remove(bt->msg_file);  	debugfs_remove(bt->dropped_file); -	debugfs_remove(bt->dir);  	relay_close(bt->rchan); +	debugfs_remove(bt->dir);  	free_percpu(bt->sequence);  	free_percpu(bt->msg_data);  	kfree(bt); @@ -378,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,  static int blk_remove_buf_file_callback(struct dentry *dentry)  { -	struct dentry *parent = dentry->d_parent;  	debugfs_remove(dentry); -	/* -	* this will fail for all but the last file, but that is ok. what we -	* care about is the top level buts->name directory going away, when -	* the last trace file is gone. Then we don't have to rmdir() that -	* manually on trace stop, so it nicely solves the issue with -	* force killing of running traces. -	*/ - -	debugfs_remove(parent);  	return 0;  } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4521c77d1a1a..25edd5cc5935 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1662,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)  	mutex_lock(&ftrace_regex_lock);  	if ((file->f_mode & FMODE_WRITE) && -	    !(file->f_flags & O_APPEND)) +	    (file->f_flags & O_TRUNC))  		ftrace_filter_reset(enable);  	if (file->f_mode & FMODE_READ) { @@ -2278,7 +2278,11 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,  	read++;  	cnt--; -	if (!(iter->flags & ~FTRACE_ITER_CONT)) { +	/* +	 * If the parser haven't finished with the last write, +	 * continue reading the user input without skipping spaces. 
+	 */ +	if (!(iter->flags & FTRACE_ITER_CONT)) {  		/* skip white space */  		while (cnt && isspace(ch)) {  			ret = get_user(ch, ubuf++); @@ -2288,8 +2292,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,  			cnt--;  		} +		/* only spaces were written */  		if (isspace(ch)) { -			file->f_pos += read; +			*ppos += read;  			ret = read;  			goto out;  		} @@ -2319,12 +2324,12 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,  		if (ret)  			goto out;  		iter->buffer_idx = 0; -	} else +	} else {  		iter->flags |= FTRACE_ITER_CONT; +		iter->buffer[iter->buffer_idx++] = ch; +	} - -	file->f_pos += read; - +	*ppos += read;  	ret = read;   out:  	mutex_unlock(&ftrace_regex_lock); @@ -2577,7 +2582,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)  	mutex_lock(&graph_lock);  	if ((file->f_mode & FMODE_WRITE) && -	    !(file->f_flags & O_APPEND)) { +	    (file->f_flags & O_TRUNC)) {  		ftrace_graph_count = 0;  		memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));  	} @@ -2596,6 +2601,14 @@ ftrace_graph_open(struct inode *inode, struct file *file)  }  static int +ftrace_graph_release(struct inode *inode, struct file *file) +{ +	if (file->f_mode & FMODE_READ) +		seq_release(inode, file); +	return 0; +} + +static int  ftrace_set_func(unsigned long *array, int *idx, char *buffer)  {  	struct dyn_ftrace *rec; @@ -2724,9 +2737,10 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,  }  static const struct file_operations ftrace_graph_fops = { -	.open = ftrace_graph_open, -	.read = seq_read, -	.write = ftrace_graph_write, +	.open		= ftrace_graph_open, +	.read		= seq_read, +	.write		= ftrace_graph_write, +	.release	= ftrace_graph_release,  };  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bf27bb7a63e2..a330513d96ce 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -735,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer)  	put_online_cpus(); +	kfree(buffer->buffers);  	free_cpumask_var(buffer->cpumask);  	kfree(buffer); @@ -1785,7 +1786,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,  	 */  	RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); -	if (!rb_try_to_discard(cpu_buffer, event)) +	if (rb_try_to_discard(cpu_buffer, event))  		goto out;  	/* @@ -2383,7 +2384,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)  		 * the box. Return the padding, and we will release  		 * the current locks, and try again.  		 */ -		rb_advance_reader(cpu_buffer);  		return event;  	case RINGBUF_TYPE_TIME_EXTEND: @@ -2486,7 +2486,7 @@ static inline int rb_ok_to_lock(void)  	 * buffer too. A one time deal is all you get from reading  	 * the ring buffer from an NMI.  	 
*/ -	if (likely(!in_nmi() && !oops_in_progress)) +	if (likely(!in_nmi()))  		return 1;  	tracing_off_permanent(); @@ -2519,6 +2519,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)  	if (dolock)  		spin_lock(&cpu_buffer->reader_lock);  	event = rb_buffer_peek(buffer, cpu, ts); +	if (event && event->type_len == RINGBUF_TYPE_PADDING) +		rb_advance_reader(cpu_buffer);  	if (dolock)  		spin_unlock(&cpu_buffer->reader_lock);  	local_irq_restore(flags); @@ -2590,12 +2592,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)  		spin_lock(&cpu_buffer->reader_lock);  	event = rb_buffer_peek(buffer, cpu, ts); -	if (!event) -		goto out_unlock; - -	rb_advance_reader(cpu_buffer); +	if (event) +		rb_advance_reader(cpu_buffer); - out_unlock:  	if (dolock)  		spin_unlock(&cpu_buffer->reader_lock);  	local_irq_restore(flags); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8bc8d8afea6a..8c358395d338 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,  		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |  		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);  } +EXPORT_SYMBOL_GPL(tracing_generic_entry_update);  struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,  						    int type, @@ -2031,7 +2032,7 @@ static int tracing_open(struct inode *inode, struct file *file)  	/* If this file was open for write, then erase contents */  	if ((file->f_mode & FMODE_WRITE) && -	    !(file->f_flags & O_APPEND)) { +	    (file->f_flags & O_TRUNC)) {  		long cpu = (long) inode->i_private;  		if (cpu == TRACE_PIPE_ALL_CPU) @@ -3085,7 +3086,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)  			break;  		} -		trace_consume(iter); +		if (ret != TRACE_TYPE_NO_CONSUME) +			trace_consume(iter);  		rem -= count;  		if (!find_next_entry_inc(iter))	{  			rem = 0; @@ -3894,17 +3896,9 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,  	if (ret < 0)  		return ret; -	switch (val) { -	case 0: -		trace_flags &= ~(1 << index); -		break; -	case 1: -		trace_flags |= 1 << index; -		break; - -	default: +	if (val != 0 && val != 1)  		return -EINVAL; -	} +	set_tracer_flags(1 << index, val);  	*ppos += cnt; @@ -4233,8 +4227,11 @@ static void __ftrace_dump(bool disable_tracing)  		iter.pos = -1;  		if (find_next_entry_inc(&iter) != NULL) { -			print_trace_line(&iter); -			trace_consume(&iter); +			int ret; + +			ret = print_trace_line(&iter); +			if (ret != TRACE_TYPE_NO_CONSUME) +				trace_consume(&iter);  		}  		trace_printk_seq(&iter.seq); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3548ae5cc780..8b9f4f6e9559 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,  struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,  					  int *ent_cpu, u64 *ent_ts); -void tracing_generic_entry_update(struct trace_entry *entry, -				  unsigned long flags, -				  int pc); -  void default_wait_pipe(struct trace_iterator *iter);  void poll_wait_pipe(struct trace_iterator *iter); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 5b5895afecfe..11ba5bb4ed0a 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id)  	mutex_lock(&event_mutex);  	list_for_each_entry(event, &ftrace_events, 
list) { -		if (event->id == event_id) { +		if (event->id == event_id && event->profile_enable) {  			ret = event->profile_enable(event);  			break;  		} diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53c8fd376a88..e75276a49cf5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -376,7 +376,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)  	const struct seq_operations *seq_ops;  	if ((file->f_mode & FMODE_WRITE) && -	    !(file->f_flags & O_APPEND)) +	    (file->f_flags & O_TRUNC))  		ftrace_clear_events();  	seq_ops = inode->i_private; @@ -940,7 +940,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,  		entry = trace_create_file("enable", 0644, call->dir, call,  					  enable); -	if (call->id) +	if (call->id && call->profile_enable)  		entry = trace_create_file("id", 0444, call->dir, call,  					  id); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 936c621bbf46..f32dc9d1ea7b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -624,9 +624,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,  		return -ENOSPC;  	} -	filter->preds[filter->n_preds] = pred; -	filter->n_preds++; -  	list_for_each_entry(call, &ftrace_events, list) {  		if (!call->define_fields) @@ -643,6 +640,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,  		}  		replace_filter_string(call->filter, filter_string);  	} + +	filter->preds[filter->n_preds] = pred; +	filter->n_preds++;  out:  	return err;  } @@ -1029,12 +1029,17 @@ static int replace_preds(struct event_subsystem *system,  		if (elt->op == OP_AND || elt->op == OP_OR) {  			pred = create_logical_pred(elt->op); +			if (!pred) +				return -ENOMEM;  			if (call) {  				err = filter_add_pred(ps, call, pred);  				filter_free_pred(pred); -			} else +			} else {  				err = filter_add_subsystem_pred(ps, system,  							pred, filter_string); +				if (err) +					filter_free_pred(pred); +			}  			if (err)  				return err; @@ -1048,12 +1053,17 @@ static int replace_preds(struct event_subsystem *system,  		}  		pred = create_pred(elt->op, operand1, operand2); +		if (!pred) +			return -ENOMEM;  		if (call) {  			err = filter_add_pred(ps, call, pred);  			filter_free_pred(pred); -		} else +		} else {  			err = filter_add_subsystem_pred(ps, system, pred,  							filter_string); +			if (err) +				filter_free_pred(pred); +		}  		if (err)  			return err; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d2249abafb53..420ec3487579 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -843,9 +843,16 @@ print_graph_function(struct trace_iterator *iter)  	switch (entry->type) {  	case TRACE_GRAPH_ENT: { -		struct ftrace_graph_ent_entry *field; +		/* +		 * print_graph_entry() may consume the current event, +		 * thus @field may become invalid, so we need to save it. +		 * sizeof(struct ftrace_graph_ent_entry) is very small, +		 * it can be safely saved at the stack. 
+		 */ +		struct ftrace_graph_ent_entry *field, saved;  		trace_assign_type(field, entry); -		return print_graph_entry(field, s, iter); +		saved = *field; +		return print_graph_entry(&saved, s, iter);  	}  	case TRACE_GRAPH_RET: {  		struct ftrace_graph_ret_entry *field; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 7b6278110827..687699d365ae 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -176,7 +176,7 @@ static int t_show(struct seq_file *m, void *v)  	const char *str = *fmt;  	int i; -	seq_printf(m, "0x%lx : \"", (unsigned long)fmt); +	seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);  	/*  	 * Tabs and new lines need to be converted. diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e644af910124..6a2a9d484cd6 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -301,17 +301,14 @@ static const struct seq_operations stack_trace_seq_ops = {  static int stack_trace_open(struct inode *inode, struct file *file)  { -	int ret; - -	ret = seq_open(file, &stack_trace_seq_ops); - -	return ret; +	return seq_open(file, &stack_trace_seq_ops);  }  static const struct file_operations stack_trace_fops = {  	.open		= stack_trace_open,  	.read		= seq_read,  	.llseek		= seq_lseek, +	.release	= seq_release,  };  int diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index e66f5e493342..aea321c82fa0 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -73,7 +73,7 @@ static struct rb_node *release_next(struct rb_node *node)  	}  } -static void reset_stat_session(struct stat_session *session) +static void __reset_stat_session(struct stat_session *session)  {  	struct rb_node *node = session->stat_root.rb_node; @@ -83,10 +83,17 @@ static void reset_stat_session(struct stat_session *session)  	session->stat_root = RB_ROOT;  } +static void reset_stat_session(struct stat_session *session) +{ +	mutex_lock(&session->stat_mutex); +	__reset_stat_session(session); +	mutex_unlock(&session->stat_mutex); +} +  static void destroy_session(struct stat_session *session)  {  	debugfs_remove(session->file); -	reset_stat_session(session); +	__reset_stat_session(session);  	mutex_destroy(&session->stat_mutex);  	kfree(session);  } @@ -150,7 +157,7 @@ static int stat_seq_init(struct stat_session *session)  	int i;  	mutex_lock(&session->stat_mutex); -	reset_stat_session(session); +	__reset_stat_session(session);  	if (!ts->stat_cmp)  		ts->stat_cmp = dummy_cmp; @@ -183,7 +190,7 @@ exit:  	return ret;  exit_free_rbtree: -	reset_stat_session(session); +	__reset_stat_session(session);  	mutex_unlock(&session->stat_mutex);  	return ret;  } @@ -250,16 +257,21 @@ static const struct seq_operations trace_stat_seq_ops = {  static int tracing_stat_open(struct inode *inode, struct file *file)  {  	int ret; - +	struct seq_file *m;  	struct stat_session *session = inode->i_private; +	ret = stat_seq_init(session); +	if (ret) +		return ret; +  	ret = seq_open(file, &trace_stat_seq_ops); -	if (!ret) { -		struct seq_file *m = file->private_data; -		m->private = session; -		ret = stat_seq_init(session); +	if (ret) { +		reset_stat_session(session); +		return ret;  	} +	m = file->private_data; +	m->private = session;  	return ret;  } @@ -270,11 +282,9 @@ static int tracing_stat_release(struct inode *i, struct file *f)  {  	struct stat_session *session = i->i_private; -	mutex_lock(&session->stat_mutex);  	reset_stat_session(session); -	mutex_unlock(&session->stat_mutex); -	return 0; +	return 
seq_release(i, f);  }  static const struct file_operations tracing_stat_fops = { diff --git a/kernel/wait.c b/kernel/wait.c index ea7c3b4275cf..c4bd3d825f35 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,13 +10,14 @@  #include <linux/wait.h>  #include <linux/hash.h> -void init_waitqueue_head(wait_queue_head_t *q) +void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)  {  	spin_lock_init(&q->lock); +	lockdep_set_class(&q->lock, key);  	INIT_LIST_HEAD(&q->task_list);  } -EXPORT_SYMBOL(init_waitqueue_head); +EXPORT_SYMBOL(__init_waitqueue_head);  void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)  {  | 
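
The kernel/wait.c hunk above contains only the out-of-line half of the wait-queue lockdep change: __init_waitqueue_head() now takes a struct lock_class_key and applies it with lockdep_set_class(), so wait queue heads initialised from different call sites can carry distinct lockdep classes. The matching <linux/wait.h> side is not part of this diff; a minimal sketch of the kind of wrapper that would pair with the new export (macro name and body assumed here, not taken from the patch) is:

	/*
	 * Sketch only: keep the old init_waitqueue_head() name as a macro.
	 * The static lock_class_key is instantiated once per call site, so
	 * every distinct initialisation point gets its own lockdep class
	 * instead of all wait queue heads sharing a single one.
	 */
	#define init_waitqueue_head(q)					\
		do {							\
			static struct lock_class_key __key;		\
									\
			__init_waitqueue_head((q), &__key);		\
		} while (0)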
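
Several of the tracing hunks above (ftrace_regex_open(), ftrace_graph_open(), tracing_open() and ftrace_event_seq_open()) make the same behavioural change: existing state is cleared only when the file is opened with O_TRUNC, instead of whenever O_APPEND is absent. In practice, "echo foo > file" (O_WRONLY|O_CREAT|O_TRUNC) still resets the filter or trace, while "echo foo >> file" (O_APPEND) and a bare O_WRONLY open now leave the current state untouched. A condensed sketch of the resulting ->open() logic, using illustrative names rather than the real kernel handlers:

	#include <linux/fs.h>

	/* Stand-in for whatever state the real handlers clear. */
	static void example_state_reset(void)
	{
		/* clear the filter list / trace buffer here */
	}

	static int example_open(struct inode *inode, struct file *file)
	{
		/*
		 * Reset only on a truncating write open:
		 *   ">"  sets O_TRUNC            -> reset
		 *   ">>" sets O_APPEND           -> keep state
		 *   plain O_WRONLY sets neither  -> keep state (new behaviour)
		 */
		if ((file->f_mode & FMODE_WRITE) &&
		    (file->f_flags & O_TRUNC))
			example_state_reset();

		return 0;
	}

Under the old !(f_flags & O_APPEND) test the third case was also treated as a reset, which is the surprise these hunks remove.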
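
The kernel/time/clockevents.c hunks convert clockevents_lock from plain spin_lock()/spin_unlock() to spin_lock_irqsave()/spin_unlock_irqrestore(). The diff itself does not state the motivation, but the usual reason for this conversion is that the lock must be safe to take both from ordinary process context and from paths that run with interrupts disabled (or that can race with interrupt-context users of the same lock); saving and restoring the flags keeps each caller's interrupt state intact. A minimal sketch of the pattern, with illustrative names rather than the clockevents code:

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(example_lock);
	static int example_state;

	/* Safe to call with interrupts enabled or already disabled. */
	static void example_update(int value)
	{
		unsigned long flags;

		/* Disable local interrupts and remember their previous state. */
		spin_lock_irqsave(&example_lock, flags);
		example_state = value;
		/* Restore whatever interrupt state the caller had on entry. */
		spin_unlock_irqrestore(&example_lock, flags);
	}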
