Diffstat (limited to 'arch/x86/kernel/ds.c')
 -rw-r--r--   arch/x86/kernel/ds.c   1147
 1 files changed, 644 insertions, 503 deletions
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index a2d1176c38ee..da91701a2348 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -6,14 +6,13 @@
  * precise-event based sampling (PEBS).
  *
  * It manages:
- * - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
+ * - DS and BTS hardware configuration
+ * - buffer overflow handling (to be done)
  * - buffer access
  *
- * It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
+ * It does not do:
+ * - security checking (is the caller allowed to trace the task)
+ * - buffer allocation (memory accounting)
  *
  *
  * Copyright (C) 2007-2008 Intel Corporation.
@@ -28,22 +27,69 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/kernel.h>
 
 
 /*
  * The configuration for a particular DS hardware implementation.
  */
 struct ds_configuration {
-	/* the size of the DS structure in bytes */
-	unsigned char  sizeof_ds;
-	/* the size of one pointer-typed field in the DS structure in bytes;
-	   this covers the first 8 fields related to buffer management. */
+	/* the name of the configuration */
+	const char *name;
+	/* the size of one pointer-typed field in the DS structure and
+	   in the BTS and PEBS buffers in bytes;
+	   this covers the first 8 DS fields related to buffer management. */
 	unsigned char  sizeof_field;
 	/* the size of a BTS/PEBS record in bytes */
 	unsigned char  sizeof_rec[2];
+	/* a series of bit-masks to control various features indexed
+	 * by enum ds_feature */
+	unsigned long ctl[dsf_ctl_max];
 };
-static struct ds_configuration ds_cfg;
+static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
+#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
+
+#define MAX_SIZEOF_DS (12 * 8)	/* maximal size of a DS configuration */
+#define MAX_SIZEOF_BTS (3 * 8)	/* maximal size of a BTS record */
+#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment */
+
+#define BTS_CONTROL \
+ (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
+  ds_cfg.ctl[dsf_bts_overflow])
+
+
+/*
+ * A BTS or PEBS tracer.
+ *
+ * This holds the configuration of the tracer and serves as a handle
+ * to identify tracers.
+ */
+struct ds_tracer {
+	/* the DS context (partially) owned by this tracer */
+	struct ds_context *context;
+	/* the buffer provided on ds_request() and its size in bytes */
+	void *buffer;
+	size_t size;
+};
+
+struct bts_tracer {
+	/* the common DS part */
+	struct ds_tracer ds;
+	/* the trace including the DS configuration */
+	struct bts_trace trace;
+	/* buffer overflow notification function */
+	bts_ovfl_callback_t ovfl;
+};
+
+struct pebs_tracer {
+	/* the common DS part */
+	struct ds_tracer ds;
+	/* the trace including the DS configuration */
+	struct pebs_trace trace;
+	/* buffer overflow notification function */
+	pebs_ovfl_callback_t ovfl;
+};
 
 /*
  * Debug Store (DS) save area configuration (see Intel64 and IA32
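The per-model ctl[] table is the heart of the new configuration scheme: instead of hard-coding DEBUGCTL MSR bits, each ds_configuration records where its BTS enable and kernel/user suppress bits live, and BTS_CONTROL simply ORs the relevant masks together (the real macro also folds in dsf_bts_overflow). A minimal stand-alone sketch of that composition — the bit positions are the core-2 values from the ds_cfg_core2 table later in this patch, everything else is illustrative, not kernel code:

    #include <stdio.h>

    /* illustrative model of the ctl[] scheme; mirrors enum ds_feature */
    enum ds_feature { dsf_bts, dsf_bts_kernel, dsf_bts_user, dsf_ctl_max };

    struct ds_configuration {
            const char *name;
            unsigned long ctl[dsf_ctl_max];     /* per-model bit masks */
    };

    static const struct ds_configuration cfg = {
            .name                = "core 2",
            .ctl[dsf_bts]        = (1 << 6) | (1 << 7),
            .ctl[dsf_bts_kernel] = (1 << 9),
            .ctl[dsf_bts_user]   = (1 << 10),
    };

    int main(void)
    {
            /* the equivalent of the BTS_CONTROL composition above */
            unsigned long control = cfg.ctl[dsf_bts] |
                    cfg.ctl[dsf_bts_kernel] | cfg.ctl[dsf_bts_user];

            printf("%s: DEBUGCTL mask %#lx\n", cfg.name, control);
            return 0;
    }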
@@ -109,32 +155,9 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
 
 /*
- * Locking is done only for allocating BTS or PEBS resources and for
- * guarding context and buffer memory allocation.
- *
- * Most functions require the current task to own the ds context part
- * they are going to access. All the locking is done when validating
- * access to the context.
+ * Locking is done only for allocating BTS or PEBS resources.
  */
-static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
-
-/*
- * Validate that the current task is allowed to access the BTS/PEBS
- * buffer of the parameter task.
- *
- * Returns 0, if access is granted; -Eerrno, otherwise.
- */
-static inline int ds_validate_access(struct ds_context *context,
-				     enum ds_qualifier qual)
-{
-	if (!context)
-		return -EPERM;
-
-	if (context->owner[qual] == current)
-		return 0;
-
-	return -EPERM;
-}
+static DEFINE_SPINLOCK(ds_lock);
 
 
 /*
@@ -150,27 +173,32 @@ static inline int ds_validate_access(struct ds_context *context,
  *   >0  number of per-thread tracers
  *   <0  number of per-cpu tracers
  *
- * The below functions to get and put tracers and to check the
- * allocation type require the ds_lock to be held by the caller.
- *
  * Tracers essentially gives the number of ds contexts for a certain
  * type of allocation.
  */
-static long tracers;
+static atomic_t tracers = ATOMIC_INIT(0);
 
 static inline void get_tracer(struct task_struct *task)
 {
-	tracers += (task ? 1 : -1);
+	if (task)
+		atomic_inc(&tracers);
+	else
+		atomic_dec(&tracers);
 }
 
 static inline void put_tracer(struct task_struct *task)
 {
-	tracers -= (task ? 1 : -1);
+	if (task)
+		atomic_dec(&tracers);
+	else
+		atomic_inc(&tracers);
 }
 
 static inline int check_tracer(struct task_struct *task)
 {
-	return (task ? (tracers >= 0) : (tracers <= 0));
+	return task ?
+		(atomic_read(&tracers) >= 0) :
+		(atomic_read(&tracers) <= 0);
 }
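The signed counter enforces mutual exclusion between the two allocation types: per-thread tracers (task != NULL) may only be requested while the count is >= 0, per-cpu tracers only while it is <= 0, so the two kinds can never be active at the same time (the request paths below still call check_tracer() and get_tracer() under ds_lock, so the check-then-increment pair stays atomic). A user-space sketch of the invariant, with the locking deliberately elided:

    #include <stdio.h>

    /* sketch only: >0 means per-thread tracers, <0 means per-cpu tracers */
    static long tracers;

    static int check_tracer(int per_thread)
    {
            return per_thread ? (tracers >= 0) : (tracers <= 0);
    }

    static void get_tracer(int per_thread)
    {
            tracers += per_thread ? 1 : -1;
    }

    int main(void)
    {
            printf("per-thread allowed: %d\n", check_tracer(1)); /* 1 */
            get_tracer(1);                          /* count goes to +1 */
            printf("per-cpu allowed:    %d\n", check_tracer(0)); /* 0 */
            return 0;
    }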
@@ -183,99 +211,70 @@ static inline int check_tracer(struct task_struct *task)
  *
  * Contexts are use-counted. They are allocated on first access and
  * deallocated when the last user puts the context.
- *
- * We distinguish between an allocating and a non-allocating get of a
- * context:
- * - the allocating get is used for requesting BTS/PEBS resources. It
- *   requires the caller to hold the global ds_lock.
- * - the non-allocating get is used for all other cases. A
- *   non-existing context indicates an error. It acquires and releases
- *   the ds_lock itself for obtaining the context.
- *
- * A context and its DS configuration are allocated and deallocated
- * together. A context always has a DS configuration of the
- * appropriate size.
- */
-static DEFINE_PER_CPU(struct ds_context *, system_context);
-
-#define this_system_context per_cpu(system_context, smp_processor_id())
-
-/*
- * Returns the pointer to the parameter task's context or to the
- * system-wide context, if task is NULL.
- *
- * Increases the use count of the returned context, if not NULL.
  */
-static inline struct ds_context *ds_get_context(struct task_struct *task)
-{
-	struct ds_context *context;
-	unsigned long irq;
+struct ds_context {
+	/* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
+	unsigned char ds[MAX_SIZEOF_DS];
+	/* the owner of the BTS and PEBS configuration, respectively */
+	struct bts_tracer *bts_master;
+	struct pebs_tracer *pebs_master;
+	/* use count */
+	unsigned long count;
+	/* a pointer to the context location inside the thread_struct
+	 * or the per_cpu context array */
+	struct ds_context **this;
+	/* a pointer to the task owning this context, or NULL, if the
+	 * context is owned by a cpu */
+	struct task_struct *task;
+};
 
-	spin_lock_irqsave(&ds_lock, irq);
+static DEFINE_PER_CPU(struct ds_context *, system_context_array);
 
-	context = (task ? task->thread.ds_ctx : this_system_context);
-	if (context)
-		context->count++;
+#define system_context per_cpu(system_context_array, smp_processor_id())
 
-	spin_unlock_irqrestore(&ds_lock, irq);
-
-	return context;
-}
 
-/*
- * Same as ds_get_context, but allocates the context and it's DS
- * structure, if necessary; returns NULL; if out of memory.
- */
-static inline struct ds_context *ds_alloc_context(struct task_struct *task)
+static inline struct ds_context *ds_get_context(struct task_struct *task)
 {
 	struct ds_context **p_context =
-		(task ? &task->thread.ds_ctx : &this_system_context);
-	struct ds_context *context = *p_context;
+		(task ? &task->thread.ds_ctx : &system_context);
+	struct ds_context *context = NULL;
+	struct ds_context *new_context = NULL;
 	unsigned long irq;
 
-	if (!context) {
-		context = kzalloc(sizeof(*context), GFP_KERNEL);
-		if (!context)
-			return NULL;
-
-		context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
-		if (!context->ds) {
-			kfree(context);
-			return NULL;
-		}
+	/* Chances are small that we already have a context. */
+	new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
+	if (!new_context)
+		return NULL;
 
-		spin_lock_irqsave(&ds_lock, irq);
+	spin_lock_irqsave(&ds_lock, irq);
 
-		if (*p_context) {
-			kfree(context->ds);
-			kfree(context);
+	context = *p_context;
+	if (!context) {
+		context = new_context;
 
-			context = *p_context;
-		} else {
-			*p_context = context;
+		context->this = p_context;
+		context->task = task;
+		context->count = 0;
 
-			context->this = p_context;
-			context->task = task;
+		if (task)
+			set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
 
-			if (task)
-				set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
+		if (!task || (task == current))
+			wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
 
-			if (!task || (task == current))
-				wrmsrl(MSR_IA32_DS_AREA,
-				       (unsigned long)context->ds);
-		}
-		spin_unlock_irqrestore(&ds_lock, irq);
+		*p_context = context;
 	}
 
 	context->count++;
 
+	spin_unlock_irqrestore(&ds_lock, irq);
+
+	if (context != new_context)
+		kfree(new_context);
+
 	return context;
 }
 
-/*
- * Decreases the use count of the parameter context, if not NULL.
- * Deallocates the context, if the use count reaches zero.
- */
 static inline void ds_put_context(struct ds_context *context)
 {
 	unsigned long irq;
@@ -285,8 +284,10 @@ static inline void ds_put_context(struct ds_context *context)
 
 	spin_lock_irqsave(&ds_lock, irq);
 
-	if (--context->count)
-		goto out;
+	if (--context->count) {
+		spin_unlock_irqrestore(&ds_lock, irq);
+		return;
+	}
 
 	*(context->this) = NULL;
 
@@ -296,135 +297,263 @@ static inline void ds_put_context(struct ds_context *context)
 	if (!context->task || (context->task == current))
 		wrmsrl(MSR_IA32_DS_AREA, 0);
 
-	put_tracer(context->task);
+	spin_unlock_irqrestore(&ds_lock, irq);
 
-	/* free any leftover buffers from tracers that did not
-	 * deallocate them properly. */
-	kfree(context->buffer[ds_bts]);
-	kfree(context->buffer[ds_pebs]);
-	kfree(context->ds);
 	kfree(context);
- out:
-	spin_unlock_irqrestore(&ds_lock, irq);
 }
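The rewritten ds_get_context() is a textbook instance of the allocate-outside-the-lock pattern: kzalloc(GFP_KERNEL) may sleep and so cannot run under the spinlock, so the context is allocated up front, published under the lock only if nobody else got there first, and the spare allocation is freed afterwards. A pthread-based sketch of the same pattern (names are illustrative, not kernel code):

    #include <pthread.h>
    #include <stdlib.h>

    /* illustrative model of ds_get_context()'s allocate-then-publish */
    struct ctx { unsigned long count; };

    static struct ctx *slot;    /* stands in for *p_context */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static struct ctx *get_ctx(void)
    {
            struct ctx *new_ctx = calloc(1, sizeof(*new_ctx)); /* may block */
            struct ctx *ctx;

            if (!new_ctx)
                    return NULL;

            pthread_mutex_lock(&lock);
            ctx = slot;
            if (!ctx)
                    ctx = slot = new_ctx;   /* we won the race: publish */
            ctx->count++;                   /* use count, as in the patch */
            pthread_mutex_unlock(&lock);

            if (ctx != new_ctx)
                    free(new_ctx);          /* lost the race: discard */
            return ctx;
    }

    int main(void)
    {
            return get_ctx() ? 0 : 1;
    }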
 
 
 /*
- * Handle a buffer overflow
+ * Call the tracer's callback on a buffer overflow.
  *
- * task: the task whose buffers are overflowing;
- *       NULL for a buffer overflow on the current cpu
  * context: the ds context
  * qual: the buffer type
  */
-static void ds_overflow(struct task_struct *task, struct ds_context *context,
-			enum ds_qualifier qual)
+static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
 {
-	if (!context)
-		return;
-
-	if (context->callback[qual])
-		(*context->callback[qual])(task);
-
-	/* todo: do some more overflow handling */
+	switch (qual) {
+	case ds_bts:
+		if (context->bts_master &&
+		    context->bts_master->ovfl)
+			context->bts_master->ovfl(context->bts_master);
+		break;
+	case ds_pebs:
+		if (context->pebs_master &&
+		    context->pebs_master->ovfl)
+			context->pebs_master->ovfl(context->pebs_master);
+		break;
+	}
 }
 
 
 /*
- * Allocate a non-pageable buffer of the parameter size.
- * Checks the memory and the locked memory rlimit.
+ * Write raw data into the BTS or PEBS buffer.
  *
- * Returns the buffer, if successful;
- *         NULL, if out of memory or rlimit exceeded.
+ * The remainder of any partially written record is zeroed out.
  *
- * size: the requested buffer size in bytes
- * pages (out): if not NULL, contains the number of pages reserved
+ * context: the DS context
+ * qual: the buffer type
+ * record: the data to write
+ * size: the size of the data
  */
-static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
+static int ds_write(struct ds_context *context, enum ds_qualifier qual,
+		    const void *record, size_t size)
 {
-	unsigned long rlim, vm, pgsz;
-	void *buffer;
+	int bytes_written = 0;
 
-	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	if (!record)
+		return -EINVAL;
 
-	rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
-	vm   = current->mm->total_vm  + pgsz;
-	if (rlim < vm)
-		return NULL;
+	while (size) {
+		unsigned long base, index, end, write_end, int_th;
+		unsigned long write_size, adj_write_size;
 
-	rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
-	vm   = current->mm->locked_vm  + pgsz;
-	if (rlim < vm)
-		return NULL;
+		/*
+		 * write as much as possible without producing an
+		 * overflow interrupt.
+		 *
+		 * interrupt_threshold must either be
+		 * - bigger than absolute_maximum or
+		 * - point to a record between buffer_base and absolute_maximum
+		 *
+		 * index points to a valid record.
+		 */
+		base   = ds_get(context->ds, qual, ds_buffer_base);
+		index  = ds_get(context->ds, qual, ds_index);
+		end    = ds_get(context->ds, qual, ds_absolute_maximum);
+		int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
 
-	buffer = kzalloc(size, GFP_KERNEL);
-	if (!buffer)
-		return NULL;
+		write_end = min(end, int_th);
 
-	current->mm->total_vm  += pgsz;
-	current->mm->locked_vm += pgsz;
+		/* if we are already beyond the interrupt threshold,
+		 * we fill the entire buffer */
+		if (write_end <= index)
+			write_end = end;
 
-	if (pages)
-		*pages = pgsz;
+		if (write_end <= index)
+			break;
+
+		write_size = min((unsigned long) size, write_end - index);
+		memcpy((void *)index, record, write_size);
 
-	return buffer;
+		record = (const char *)record + write_size;
+		size -= write_size;
+		bytes_written += write_size;
+
+		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
+		adj_write_size *= ds_cfg.sizeof_rec[qual];
+
+		/* zero out trailing bytes */
+		memset((char *)index + write_size, 0,
+		       adj_write_size - write_size);
+		index += adj_write_size;
+
+		if (index >= end)
+			index = base;
+		ds_set(context->ds, qual, ds_index, index);
+
+		if (index >= int_th)
+			ds_overflow(context, qual);
+	}
+
+	return bytes_written;
 }
 
-static int ds_request(struct task_struct *task, void *base, size_t size,
-		      ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
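ds_write() treats the buffer as a ring of fixed-size records: it copies as much as fits below min(absolute_maximum, interrupt_threshold), advances the index by whole records (zero-filling any partial tail), wraps back to buffer_base at the end, and calls the overflow handler once the index crosses the threshold. A stand-alone sketch of that index arithmetic — record count, size, and threshold are illustrative values, not hardware ones:

    #include <stdio.h>
    #include <string.h>

    /* toy model of the ds_write() ring; plain offsets stand in for the
     * buffer_base/index/absolute_maximum/interrupt_threshold pointers */
    enum { REC = 24, NREC = 4, END = NREC * REC };

    static unsigned char buf[END];
    static size_t idx;                       /* ds_index */
    static size_t int_th = (NREC - 1) * REC; /* ds_interrupt_threshold */

    static void write_rec(const unsigned char *rec)
    {
            memcpy(buf + idx, rec, REC);
            idx += REC;                      /* whole records only */
            if (idx >= END)
                    idx = 0;                 /* wrap to buffer_base */
            if (idx >= int_th)
                    puts("threshold crossed: ds_overflow() would run");
    }

    int main(void)
    {
            unsigned char rec[REC] = { 0 };

            for (int i = 0; i < 6; i++) {    /* six writes into four slots */
                    write_rec(rec);
                    printf("write %d -> index %zu\n", i + 1, idx);
            }
            return 0;
    }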
+
+/*
+ * Branch Trace Store (BTS) uses the following format. Different
+ * architectures vary in the size of those fields.
+ * - source linear address
+ * - destination linear address
+ * - flags
+ *
+ * Later architectures use 64bit pointers throughout, whereas earlier
+ * architectures use 32bit pointers in 32bit mode.
+ *
+ * We compute the base address for the first 8 fields based on:
+ * - the field size stored in the DS configuration
+ * - the relative field position
+ *
+ * In order to store additional information in the BTS buffer, we use
+ * a special source address to indicate that the record requires
+ * special interpretation.
+ *
+ * Netburst indicated via a bit in the flags field whether the branch
+ * was predicted; this is ignored.
+ *
+ * We use two levels of abstraction:
+ * - the raw data level defined here
+ * - an arch-independent level defined in ds.h
+ */
+
+enum bts_field {
+	bts_from,
+	bts_to,
+	bts_flags,
+
+	bts_qual = bts_from,
+	bts_jiffies = bts_to,
+	bts_pid = bts_flags,
+
+	bts_qual_mask = (bts_qual_max - 1),
+	bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
+};
+
+static inline unsigned long bts_get(const char *base, enum bts_field field)
 {
-	struct ds_context *context;
-	unsigned long buffer, adj;
-	const unsigned long alignment = (1 << 3);
-	unsigned long irq;
-	int error = 0;
+	base += (ds_cfg.sizeof_field * field);
+	return *(unsigned long *)base;
+}
+
+static inline void bts_set(char *base, enum bts_field field, unsigned long val)
+{
+	base += (ds_cfg.sizeof_field * field);
+	(*(unsigned long *)base) = val;
+}
 
-	if (!ds_cfg.sizeof_ds)
-		return -EOPNOTSUPP;
 
-	/* we require some space to do alignment adjustments below */
-	if (size < (alignment + ds_cfg.sizeof_rec[qual]))
+/*
+ * The raw BTS data is architecture dependent.
+ *
+ * For higher-level users, we give an arch-independent view.
+ * - ds.h defines struct bts_struct
+ * - bts_read translates one raw bts record into a bts_struct
+ * - bts_write translates one bts_struct into the raw format and
+ *   writes it into the top of the parameter tracer's buffer.
+ *
+ * return: bytes read/written on success; -Eerrno, otherwise
+ */
+static int bts_read(struct bts_tracer *tracer, const void *at,
+		    struct bts_struct *out)
+{
+	if (!tracer)
 		return -EINVAL;
 
-	/* buffer overflow notification is not yet implemented */
-	if (ovfl)
-		return -EOPNOTSUPP;
+	if (at < tracer->trace.ds.begin)
+		return -EINVAL;
+
+	if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
+		return -EINVAL;
 
-	context = ds_alloc_context(task);
-	if (!context)
-		return -ENOMEM;
+	memset(out, 0, sizeof(*out));
+	if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
+		out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
+		out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
+		out->variant.timestamp.pid = bts_get(at, bts_pid);
+	} else {
+		out->qualifier = bts_branch;
+		out->variant.lbr.from = bts_get(at, bts_from);
+		out->variant.lbr.to   = bts_get(at, bts_to);
+
+		if (!out->variant.lbr.from && !out->variant.lbr.to)
+			out->qualifier = bts_invalid;
+	}
 
-	spin_lock_irqsave(&ds_lock, irq);
+	return ds_cfg.sizeof_rec[ds_bts];
+}
 
-	error = -EPERM;
-	if (!check_tracer(task))
-		goto out_unlock;
+static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
+{
+	unsigned char raw[MAX_SIZEOF_BTS];
 
-	get_tracer(task);
+	if (!tracer)
+		return -EINVAL;
 
-	error = -EALREADY;
-	if (context->owner[qual] == current)
-		goto out_put_tracer;
-	error = -EPERM;
-	if (context->owner[qual] != NULL)
-		goto out_put_tracer;
-	context->owner[qual] = current;
+	if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
+		return -EOVERFLOW;
 
-	spin_unlock_irqrestore(&ds_lock, irq);
+	switch (in->qualifier) {
+	case bts_invalid:
+		bts_set(raw, bts_from, 0);
+		bts_set(raw, bts_to, 0);
+		bts_set(raw, bts_flags, 0);
+		break;
+	case bts_branch:
+		bts_set(raw, bts_from, in->variant.lbr.from);
+		bts_set(raw, bts_to,   in->variant.lbr.to);
+		bts_set(raw, bts_flags, 0);
+		break;
+	case bts_task_arrives:
+	case bts_task_departs:
+		bts_set(raw, bts_qual, (bts_escape | in->qualifier));
+		bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
+		bts_set(raw, bts_pid, in->variant.timestamp.pid);
+		break;
+	default:
+		return -EINVAL;
+	}
 
-	error = -ENOMEM;
-	if (!base) {
-		base = ds_allocate_buffer(size, &context->pages[qual]);
-		if (!base)
-			goto out_release;
-		context->buffer[qual]   = base;
-	}
-	error = 0;
+	return ds_write(tracer->ds.context, ds_bts, raw,
+			ds_cfg.sizeof_rec[ds_bts]);
+}
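bts_get() and bts_set() address a record's three fields purely as sizeof_field * field, which is how the same code serves both the 4-byte field layout of 32-bit Netburst/Pentium M and the 8-byte layout of core 2; the escape encoding for timestamp records rides on the same accessors. A stand-alone sketch of the addressing, assuming the 8-byte field size from ds_cfg_core2 (addresses are made up):

    #include <stdio.h>

    /* illustrative raw-record accessors mirroring bts_get()/bts_set(),
     * with the field size fixed instead of read from ds_cfg */
    enum bts_field { bts_from, bts_to, bts_flags };
    enum { SIZEOF_FIELD = 8 };

    static unsigned long bts_get(const char *base, enum bts_field field)
    {
            return *(const unsigned long *)(base + SIZEOF_FIELD * field);
    }

    static void bts_set(char *base, enum bts_field field, unsigned long val)
    {
            *(unsigned long *)(base + SIZEOF_FIELD * field) = val;
    }

    int main(void)
    {
            char raw[3 * SIZEOF_FIELD] = { 0 };

            bts_set(raw, bts_from, 0x400123);
            bts_set(raw, bts_to,   0x400456);
            printf("branch %#lx -> %#lx\n",
                   bts_get(raw, bts_from), bts_get(raw, bts_to));
            return 0;
    }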
 
-	context->callback[qual] = ovfl;
+static void ds_write_config(struct ds_context *context,
+			    struct ds_trace *cfg, enum ds_qualifier qual)
+{
+	unsigned char *ds = context->ds;
+
+	ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
+	ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
+	ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
+	ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
+}
+
+static void ds_read_config(struct ds_context *context,
+			   struct ds_trace *cfg, enum ds_qualifier qual)
+{
+	unsigned char *ds = context->ds;
+
+	cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
+	cfg->top = (void *)ds_get(ds, qual, ds_index);
+	cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
+	cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
+}
+
+static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
+			     void *base, size_t size, size_t ith,
+			     unsigned int flags) {
+	unsigned long buffer, adj;
 
 	/* adjust the buffer address and size to meet alignment
 	 * constraints:
@@ -436,410 +565,383 @@ static int ds_request(struct task_struct *task, void *base, size_t size,
 	 */
 	buffer = (unsigned long)base;
 
-	adj = ALIGN(buffer, alignment) - buffer;
+	adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
 	buffer += adj;
 	size   -= adj;
 
-	size /= ds_cfg.sizeof_rec[qual];
-	size *= ds_cfg.sizeof_rec[qual];
-
-	ds_set(context->ds, qual, ds_buffer_base, buffer);
-	ds_set(context->ds, qual, ds_index, buffer);
-	ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
+	trace->n = size / ds_cfg.sizeof_rec[qual];
+	trace->size = ds_cfg.sizeof_rec[qual];
 
-	if (ovfl) {
-		/* todo: select a suitable interrupt threshold */
-	} else
-		ds_set(context->ds, qual,
-		       ds_interrupt_threshold, buffer + size + 1);
+	size = (trace->n * trace->size);
 
-	/* we keep the context until ds_release */
-	return error;
-
- out_release:
-	context->owner[qual] = NULL;
-	ds_put_context(context);
-	put_tracer(task);
-	return error;
-
- out_put_tracer:
-	spin_unlock_irqrestore(&ds_lock, irq);
-	ds_put_context(context);
-	put_tracer(task);
-	return error;
+	trace->begin = (void *)buffer;
+	trace->top = trace->begin;
+	trace->end = (void *)(buffer + size);
+	/* The value for 'no threshold' is -1, which will set the
+	 * threshold outside of the buffer, just like we want it.
+	 */
+	trace->ith = (void *)(buffer + size - ith);
 
- out_unlock:
-	spin_unlock_irqrestore(&ds_lock, irq);
-	ds_put_context(context);
-	return error;
+	trace->flags = flags;
 }
 
-int ds_request_bts(struct task_struct *task, void *base, size_t size,
-		   ds_ovfl_callback_t ovfl)
-{
-	return ds_request(task, base, size, ovfl, ds_bts);
-}
 
-int ds_request_pebs(struct task_struct *task, void *base, size_t size,
-		    ds_ovfl_callback_t ovfl)
-{
-	return ds_request(task, base, size, ovfl, ds_pebs);
-}
-
-static int ds_release(struct task_struct *task, enum ds_qualifier qual)
+static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
+		      enum ds_qualifier qual, struct task_struct *task,
+		      void *base, size_t size, size_t th, unsigned int flags)
 {
 	struct ds_context *context;
 	int error;
 
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
+	error = -EINVAL;
+	if (!base)
 		goto out;
 
-	kfree(context->buffer[qual]);
-	context->buffer[qual] = NULL;
-
-	current->mm->total_vm  -= context->pages[qual];
-	current->mm->locked_vm -= context->pages[qual];
-	context->pages[qual] = 0;
-	context->owner[qual] = NULL;
-
-	/*
-	 * we put the context twice:
-	 *   once for the ds_get_context
-	 *   once for the corresponding ds_request
-	 */
-	ds_put_context(context);
- out:
-	ds_put_context(context);
-	return error;
-}
+	/* we require some space to do alignment adjustments below */
+	error = -EINVAL;
+	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+		goto out;
 
-int ds_release_bts(struct task_struct *task)
-{
-	return ds_release(task, ds_bts);
-}
+	if (th != (size_t)-1) {
+		th *= ds_cfg.sizeof_rec[qual];
 
-int ds_release_pebs(struct task_struct *task)
-{
-	return ds_release(task, ds_pebs);
-}
+		error = -EINVAL;
+		if (size <= th)
+			goto out;
+	}
 
-static int ds_get_index(struct task_struct *task, size_t *pos,
-			enum ds_qualifier qual)
-{
-	struct ds_context *context;
-	unsigned long base, index;
-	int error;
+	tracer->buffer = base;
+	tracer->size = size;
 
+	error = -ENOMEM;
 	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
+	if (!context)
 		goto out;
+	tracer->context = context;
 
-	base  = ds_get(context->ds, qual, ds_buffer_base);
-	index = ds_get(context->ds, qual, ds_index);
+	ds_init_ds_trace(trace, qual, base, size, th, flags);
 
-	error = ((index - base) / ds_cfg.sizeof_rec[qual]);
-	if (pos)
-		*pos = error;
+	error = 0;
  out:
-	ds_put_context(context);
 	return error;
 }
 
-int ds_get_bts_index(struct task_struct *task, size_t *pos)
-{
-	return ds_get_index(task, pos, ds_bts);
-}
-
-int ds_get_pebs_index(struct task_struct *task, size_t *pos)
+struct bts_tracer *ds_request_bts(struct task_struct *task,
+				  void *base, size_t size,
+				  bts_ovfl_callback_t ovfl, size_t th,
+				  unsigned int flags)
 {
-	return ds_get_index(task, pos, ds_pebs);
-}
-
-static int ds_get_end(struct task_struct *task, size_t *pos,
-		      enum ds_qualifier qual)
-{
-	struct ds_context *context;
-	unsigned long base, end;
+	struct bts_tracer *tracer;
+	unsigned long irq;
 	int error;
 
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
+	error = -EOPNOTSUPP;
+	if (!ds_cfg.ctl[dsf_bts])
 		goto out;
 
-	base = ds_get(context->ds, qual, ds_buffer_base);
-	end  = ds_get(context->ds, qual, ds_absolute_maximum);
+	/* buffer overflow notification is not yet implemented */
+	error = -EOPNOTSUPP;
+	if (ovfl)
+		goto out;
 
-	error = ((end - base) / ds_cfg.sizeof_rec[qual]);
-	if (pos)
-		*pos = error;
- out:
-	ds_put_context(context);
-	return error;
-}
+	error = -ENOMEM;
+	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+	if (!tracer)
+		goto out;
+	tracer->ovfl = ovfl;
 
-int ds_get_bts_end(struct task_struct *task, size_t *pos)
-{
-	return ds_get_end(task, pos, ds_bts);
-}
+	error = ds_request(&tracer->ds, &tracer->trace.ds,
+			   ds_bts, task, base, size, th, flags);
+	if (error < 0)
+		goto out_tracer;
 
-int ds_get_pebs_end(struct task_struct *task, size_t *pos)
-{
-	return ds_get_end(task, pos, ds_pebs);
-}
-
-static int ds_access(struct task_struct *task, size_t index,
-		     const void **record, enum ds_qualifier qual)
-{
-	struct ds_context *context;
-	unsigned long base, idx;
-	int error;
+	spin_lock_irqsave(&ds_lock, irq);
 
-	if (!record)
-		return -EINVAL;
+	error = -EPERM;
+	if (!check_tracer(task))
+		goto out_unlock;
+	get_tracer(task);
 
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
-		goto out;
+	error = -EPERM;
+	if (tracer->ds.context->bts_master)
+		goto out_put_tracer;
+	tracer->ds.context->bts_master = tracer;
 
-	base = ds_get(context->ds, qual, ds_buffer_base);
-	idx = base + (index * ds_cfg.sizeof_rec[qual]);
+	spin_unlock_irqrestore(&ds_lock, irq);
 
-	error = -EINVAL;
-	if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
-		goto out;
-	*record = (const void *)idx;
-	error = ds_cfg.sizeof_rec[qual];
- out:
-	ds_put_context(context);
-	return error;
-}
+	tracer->trace.read  = bts_read;
+	tracer->trace.write = bts_write;
 
-int ds_access_bts(struct task_struct *task, size_t index, const void **record)
-{
-	return ds_access(task, index, record, ds_bts);
-}
+	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	ds_resume_bts(tracer);
 
-int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
-{
-	return ds_access(task, index, record, ds_pebs);
+	return tracer;
+
+ out_put_tracer:
+	put_tracer(task);
+ out_unlock:
+	spin_unlock_irqrestore(&ds_lock, irq);
+	ds_put_context(tracer->ds.context);
+ out_tracer:
+	kfree(tracer);
+ out:
+	return ERR_PTR(error);
 }
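The request path above fixes the shape of the new client API: the caller supplies its own buffer, gets an opaque tracer handle back, and errors come via ERR_PTR(). A hypothetical in-kernel caller might look like the sketch below — the buffer size and flag combination are illustrative; BTS_KERNEL/BTS_USER/BTS_TIMESTAMPS are the ds.h flags consulted by ds_resume_bts() and ds_switch_to() in this patch:

    /* hypothetical caller of the new interface; not part of the patch */
    static struct bts_tracer *tracer;
    static unsigned char bts_buffer[4096];  /* caller-owned buffer */

    static int start_bts(struct task_struct *task)
    {
            tracer = ds_request_bts(task, bts_buffer, sizeof(bts_buffer),
                                    NULL /* ovfl: not yet implemented */,
                                    (size_t)-1 /* th: no threshold */,
                                    BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS);
            if (IS_ERR(tracer))
                    return PTR_ERR(tracer);
            return 0;
    }

    static void stop_bts(void)
    {
            ds_release_bts(tracer); /* suspends tracing, frees the handle */
    }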
 
-static int ds_write(struct task_struct *task, const void *record, size_t size,
-		    enum ds_qualifier qual, int force)
+struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+				    void *base, size_t size,
+				    pebs_ovfl_callback_t ovfl, size_t th,
+				    unsigned int flags)
 {
-	struct ds_context *context;
+	struct pebs_tracer *tracer;
+	unsigned long irq;
 	int error;
 
-	if (!record)
-		return -EINVAL;
+	/* buffer overflow notification is not yet implemented */
+	error = -EOPNOTSUPP;
+	if (ovfl)
+		goto out;
 
-	error = -EPERM;
-	context = ds_get_context(task);
-	if (!context)
+	error = -ENOMEM;
+	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+	if (!tracer)
 		goto out;
+	tracer->ovfl = ovfl;
 
-	if (!force) {
-		error = ds_validate_access(context, qual);
-		if (error < 0)
-			goto out;
-	}
+	error = ds_request(&tracer->ds, &tracer->trace.ds,
+			   ds_pebs, task, base, size, th, flags);
+	if (error < 0)
+		goto out_tracer;
 
-	error = 0;
-	while (size) {
-		unsigned long base, index, end, write_end, int_th;
-		unsigned long write_size, adj_write_size;
+	spin_lock_irqsave(&ds_lock, irq);
 
-		/*
-		 * write as much as possible without producing an
-		 * overflow interrupt.
-		 *
-		 * interrupt_threshold must either be
-		 * - bigger than absolute_maximum or
-		 * - point to a record between buffer_base and absolute_maximum
-		 *
-		 * index points to a valid record.
-		 */
-		base   = ds_get(context->ds, qual, ds_buffer_base);
-		index  = ds_get(context->ds, qual, ds_index);
-		end    = ds_get(context->ds, qual, ds_absolute_maximum);
-		int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
+	error = -EPERM;
+	if (!check_tracer(task))
+		goto out_unlock;
+	get_tracer(task);
 
-		write_end = min(end, int_th);
+	error = -EPERM;
+	if (tracer->ds.context->pebs_master)
+		goto out_put_tracer;
+	tracer->ds.context->pebs_master = tracer;
 
-		/* if we are already beyond the interrupt threshold,
-		 * we fill the entire buffer */
-		if (write_end <= index)
-			write_end = end;
+	spin_unlock_irqrestore(&ds_lock, irq);
 
-		if (write_end <= index)
-			goto out;
+	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+	ds_resume_pebs(tracer);
 
-		write_size = min((unsigned long) size, write_end - index);
-		memcpy((void *)index, record, write_size);
+	return tracer;
 
-		record = (const char *)record + write_size;
-		size  -= write_size;
-		error += write_size;
+ out_put_tracer:
+	put_tracer(task);
+ out_unlock:
+	spin_unlock_irqrestore(&ds_lock, irq);
+	ds_put_context(tracer->ds.context);
+ out_tracer:
+	kfree(tracer);
+ out:
+	return ERR_PTR(error);
+}
 
-		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
-		adj_write_size *= ds_cfg.sizeof_rec[qual];
+void ds_release_bts(struct bts_tracer *tracer)
+{
+	if (!tracer)
+		return;
 
-		/* zero out trailing bytes */
-		memset((char *)index + write_size, 0,
-		       adj_write_size - write_size);
-		index += adj_write_size;
+	ds_suspend_bts(tracer);
 
-		if (index >= end)
-			index = base;
-		ds_set(context->ds, qual, ds_index, index);
+	WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
+	tracer->ds.context->bts_master = NULL;
 
-		if (index >= int_th)
-			ds_overflow(task, context, qual);
-	}
+	put_tracer(tracer->ds.context->task);
+	ds_put_context(tracer->ds.context);
 
- out:
-	ds_put_context(context);
-	return error;
+	kfree(tracer);
 }
 
-int ds_write_bts(struct task_struct *task, const void *record, size_t size)
+void ds_suspend_bts(struct bts_tracer *tracer)
 {
-	return ds_write(task, record, size, ds_bts, /* force = */ 0);
-}
+	struct task_struct *task;
 
-int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
-{
-	return ds_write(task, record, size, ds_pebs, /* force = */ 0);
-}
+	if (!tracer)
+		return;
 
-int ds_unchecked_write_bts(struct task_struct *task,
-			   const void *record, size_t size)
-{
-	return ds_write(task, record, size, ds_bts, /* force = */ 1);
-}
+	task = tracer->ds.context->task;
 
-int ds_unchecked_write_pebs(struct task_struct *task,
-			    const void *record, size_t size)
-{
-	return ds_write(task, record, size, ds_pebs, /* force = */ 1);
+	if (!task || (task == current))
+		update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
+
+	if (task) {
+		task->thread.debugctlmsr &= ~BTS_CONTROL;
+
+		if (!task->thread.debugctlmsr)
+			clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+	}
 }
 
-static int ds_reset_or_clear(struct task_struct *task,
-			     enum ds_qualifier qual, int clear)
+void ds_resume_bts(struct bts_tracer *tracer)
 {
-	struct ds_context *context;
-	unsigned long base, end;
-	int error;
+	struct task_struct *task;
+	unsigned long control;
 
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
-		goto out;
+	if (!tracer)
+		return;
 
-	base = ds_get(context->ds, qual, ds_buffer_base);
-	end  = ds_get(context->ds, qual, ds_absolute_maximum);
+	task = tracer->ds.context->task;
 
-	if (clear)
-		memset((void *)base, 0, end - base);
+	control = ds_cfg.ctl[dsf_bts];
+	if (!(tracer->trace.ds.flags & BTS_KERNEL))
+		control |= ds_cfg.ctl[dsf_bts_kernel];
+	if (!(tracer->trace.ds.flags & BTS_USER))
+		control |= ds_cfg.ctl[dsf_bts_user];
 
-	ds_set(context->ds, qual, ds_index, base);
+	if (task) {
+		task->thread.debugctlmsr |= control;
+		set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+	}
 
-	error = 0;
- out:
-	ds_put_context(context);
-	return error;
+	if (!task || (task == current))
+		update_debugctlmsr(get_debugctlmsr() | control);
 }
 
-int ds_reset_bts(struct task_struct *task)
+void ds_release_pebs(struct pebs_tracer *tracer)
 {
-	return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
+	if (!tracer)
+		return;
+
+	ds_suspend_pebs(tracer);
+
+	WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
+	tracer->ds.context->pebs_master = NULL;
+
+	put_tracer(tracer->ds.context->task);
+	ds_put_context(tracer->ds.context);
+
+	kfree(tracer);
 }
 
-int ds_reset_pebs(struct task_struct *task)
+void ds_suspend_pebs(struct pebs_tracer *tracer)
 {
-	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
+
 }
 
-int ds_clear_bts(struct task_struct *task)
+void ds_resume_pebs(struct pebs_tracer *tracer)
 {
-	return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
+
 }
 
-int ds_clear_pebs(struct task_struct *task)
+const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
 {
-	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
+	if (!tracer)
+		return NULL;
+
+	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	return &tracer->trace;
 }
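Reading follows the same handle-based pattern: ds_read_bts() refreshes the trace view from the DS area, and the read() hook installed by ds_request_bts() decodes one raw record into the arch-independent bts_struct. A hypothetical dump routine — simplified in that it ignores wrap-around, with field names following struct bts_trace/bts_struct as used by this patch:

    /* hypothetical consumer of ds_read_bts(); not part of the patch */
    static void dump_bts(struct bts_tracer *tracer)
    {
            const struct bts_trace *trace = ds_read_bts(tracer);
            const unsigned char *at;

            if (!trace)
                    return;

            /* walk begin..top only; a full reader would also wrap */
            for (at = trace->ds.begin; (void *)at < trace->ds.top;
                 at += trace->ds.size) {
                    struct bts_struct bts;

                    if (trace->read(tracer, at, &bts) <= 0)
                            break;
                    if (bts.qualifier == bts_branch)
                            printk(KERN_DEBUG "branch %lx -> %lx\n",
                                   (unsigned long)bts.variant.lbr.from,
                                   (unsigned long)bts.variant.lbr.to);
            }
    }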
 
-int ds_get_pebs_reset(struct task_struct *task, u64 *value)
+const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
 {
-	struct ds_context *context;
-	int error;
+	if (!tracer)
+		return NULL;
+
+	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+	tracer->trace.reset_value =
+		*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
 
-	if (!value)
+	return &tracer->trace;
+}
+
+int ds_reset_bts(struct bts_tracer *tracer)
+{
+	if (!tracer)
 		return -EINVAL;
 
-	context = ds_get_context(task);
-	error = ds_validate_access(context, ds_pebs);
-	if (error < 0)
-		goto out;
+	tracer->trace.ds.top = tracer->trace.ds.begin;
 
-	*value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
+	ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+	       (unsigned long)tracer->trace.ds.top);
 
-	error = 0;
- out:
-	ds_put_context(context);
-	return error;
+	return 0;
 }
 
-int ds_set_pebs_reset(struct task_struct *task, u64 value)
+int ds_reset_pebs(struct pebs_tracer *tracer)
 {
-	struct ds_context *context;
-	int error;
+	if (!tracer)
+		return -EINVAL;
 
-	context = ds_get_context(task);
-	error = ds_validate_access(context, ds_pebs);
-	if (error < 0)
-		goto out;
+	tracer->trace.ds.top = tracer->trace.ds.begin;
 
-	*(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
+	ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
+	       (unsigned long)tracer->trace.ds.top);
 
-	error = 0;
- out:
-	ds_put_context(context);
-	return error;
+	return 0;
+}
+
+int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
+{
+	if (!tracer)
+		return -EINVAL;
+
+	*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
+
+	return 0;
 }
 
-static const struct ds_configuration ds_cfg_var = {
-	.sizeof_ds    = sizeof(long) * 12,
-	.sizeof_field = sizeof(long),
-	.sizeof_rec[ds_bts]   = sizeof(long) * 3,
+static const struct ds_configuration ds_cfg_netburst = {
+	.name = "netburst",
+	.ctl[dsf_bts]		= (1 << 2) | (1 << 3),
+	.ctl[dsf_bts_kernel]	= (1 << 5),
+	.ctl[dsf_bts_user]	= (1 << 6),
+
+	.sizeof_field		= sizeof(long),
+	.sizeof_rec[ds_bts]	= sizeof(long) * 3,
 #ifdef __i386__
-	.sizeof_rec[ds_pebs]  = sizeof(long) * 10
+	.sizeof_rec[ds_pebs]	= sizeof(long) * 10,
 #else
-	.sizeof_rec[ds_pebs]  = sizeof(long) * 18
+	.sizeof_rec[ds_pebs]	= sizeof(long) * 18,
 #endif
 };
-static const struct ds_configuration ds_cfg_64 = {
-	.sizeof_ds    = 8 * 12,
-	.sizeof_field = 8,
-	.sizeof_rec[ds_bts]   = 8 * 3,
+static const struct ds_configuration ds_cfg_pentium_m = {
+	.name = "pentium m",
+	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
+
+	.sizeof_field		= sizeof(long),
+	.sizeof_rec[ds_bts]	= sizeof(long) * 3,
 #ifdef __i386__
-	.sizeof_rec[ds_pebs]  = 8 * 10
+	.sizeof_rec[ds_pebs]	= sizeof(long) * 10,
 #else
-	.sizeof_rec[ds_pebs]  = 8 * 18
+	.sizeof_rec[ds_pebs]	= sizeof(long) * 18,
 #endif
 };
+static const struct ds_configuration ds_cfg_core2 = {
+	.name = "core 2",
+	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
+	.ctl[dsf_bts_kernel]	= (1 << 9),
+	.ctl[dsf_bts_user]	= (1 << 10),
+
+	.sizeof_field		= 8,
+	.sizeof_rec[ds_bts]	= 8 * 3,
+	.sizeof_rec[ds_pebs]	= 8 * 18,
+};
 
-static inline void
+static void
 ds_configure(const struct ds_configuration *cfg)
 {
+	memset(&ds_cfg, 0, sizeof(ds_cfg));
 	ds_cfg = *cfg;
+
+	printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
+
+	if (!cpu_has_bts) {
+		ds_cfg.ctl[dsf_bts] = 0;
+		printk(KERN_INFO "[ds] bts not available\n");
+	}
+	if (!cpu_has_pebs)
+		printk(KERN_INFO "[ds] pebs not available\n");
+
+	WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
 }
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -847,16 +949,15 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 	switch (c->x86) {
 	case 0x6:
 		switch (c->x86_model) {
+		case 0 ... 0xC:
+			/* sorry, don't know about them */
+			break;
 		case 0xD:
 		case 0xE: /* Pentium M */
-			ds_configure(&ds_cfg_var);
+			ds_configure(&ds_cfg_pentium_m);
 			break;
-		case 0xF: /* Core2 */
-		case 0x1C: /* Atom */
-			ds_configure(&ds_cfg_64);
-			break;
-		default:
-			/* sorry, don't know about them */
+		default: /* Core2, Atom, ... */
+			ds_configure(&ds_cfg_core2);
 			break;
 		}
 		break;
@@ -865,7 +966,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 		case 0x0:
 		case 0x1:
 		case 0x2: /* Netburst */
-			ds_configure(&ds_cfg_var);
+			ds_configure(&ds_cfg_netburst);
 			break;
 		default:
 			/* sorry, don't know about them */
@@ -878,12 +979,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 	}
 }
 
-void ds_free(struct ds_context *context)
+/*
+ * Change the DS configuration from tracing prev to tracing next.
+ */
+void ds_switch_to(struct task_struct *prev, struct task_struct *next)
+{
+	struct ds_context *prev_ctx = prev->thread.ds_ctx;
+	struct ds_context *next_ctx = next->thread.ds_ctx;
+
+	if (prev_ctx) {
+		update_debugctlmsr(0);
+
+		if (prev_ctx->bts_master &&
+		    (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
+			struct bts_struct ts = {
+				.qualifier = bts_task_departs,
+				.variant.timestamp.jiffies = jiffies_64,
+				.variant.timestamp.pid = prev->pid
+			};
+			bts_write(prev_ctx->bts_master, &ts);
+		}
+	}
+
+	if (next_ctx) {
+		if (next_ctx->bts_master &&
+		    (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
+			struct bts_struct ts = {
+				.qualifier = bts_task_arrives,
+				.variant.timestamp.jiffies = jiffies_64,
+				.variant.timestamp.pid = next->pid
+			};
+			bts_write(next_ctx->bts_master, &ts);
+		}
+
+		wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
+	}
+
+	update_debugctlmsr(next->thread.debugctlmsr);
+}
+
+void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
+{
+	clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
+	tsk->thread.ds_ctx = NULL;
+}
+
+void ds_exit_thread(struct task_struct *tsk)
 {
-	/* This is called when the task owning the parameter context
-	 * is dying. There should not be any user of that context left
-	 * to disturb us, anymore. */
-	unsigned long leftovers = context->count;
-	while (leftovers--)
-		ds_put_context(context);
+	WARN_ON(tsk->thread.ds_ctx);
 }
