diff options
| -rw-r--r-- | fs/orangefs/dir.c | 273 | 
1 files changed, 185 insertions, 88 deletions
| diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 9744fb3ad144..7e9814fc6cc3 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -6,6 +6,22 @@  #include "orangefs-kernel.h"  #include "orangefs-bufmap.h" +struct orangefs_dir_part { +	struct orangefs_dir_part *next; +	size_t len; +}; + +struct orangefs_dir { +	__u64 token; +	struct orangefs_dir_part *part; +	loff_t end; +	int error; +}; + +#define PART_SHIFT (24) +#define PART_SIZE (1<<24) +#define PART_MASK (~(PART_SIZE - 1)) +  /*   * There can be up to 512 directory entries.  Each entry is encoded as   * follows: @@ -15,42 +31,39 @@   * padding to 8 bytes   * 16 bytes: khandle   * padding to 8 bytes - */ -#define MAX_DIRECTORY ((4 + 257 + 3 + 16)*512) - -struct orangefs_dir { -	__u64 token; -	void *directory; -	size_t len; -	int error; -}; - -/* - * The userspace component sends several directory entries of the - * following format.  The first four bytes are the string length not - * including a trailing zero byte.  This is followed by the string and a - * trailing zero padded to the next four byte boundry.  This is followed - * by the sixteen byte khandle padded to the next eight byte boundry.   *   * The trailer_buf starts with a struct orangefs_readdir_response_s   * which must be skipped to get to the directory data. + * + * The data which is received from the userspace daemon is termed a + * part and is stored in a linked list in case more than one part is + * needed for a large directory. + * + * The position pointer (ctx->pos) encodes the part and offset on which + * to begin reading at.  Bits above PART_SHIFT encode the part and bits + * below PART_SHIFT encode the offset.  Parts are stored in a linked + * list which grows as data is received from the server.  The overhead + * associated with managing the list is presumed to be small compared to + * the overhead of communicating with the server. + * + * As data is received from the server, it is placed at the end of the + * part list.  Data is parsed from the current position as it is needed. + * When data is determined to be corrupt, it is either because the + * userspace component has sent back corrupt data or because the file + * pointer has been moved to an invalid location.  Since the two cannot + * be differentiated, return EIO. + * + * Part zero is synthesized to contains `.' and `..'.  Part one is the + * first part of the part list.   */ -static int orangefs_dir_more(struct orangefs_inode_s *oi, -    struct orangefs_dir *od, struct dentry *dentry) +static int do_readdir(struct orangefs_inode_s *oi, +    struct orangefs_dir *od, struct dentry *dentry, +    struct orangefs_kernel_op_s *op)  { -	const size_t offset = -	    sizeof(struct orangefs_readdir_response_s);  	struct orangefs_readdir_response_s *resp; -	struct orangefs_kernel_op_s *op;  	int bufi, r; -	op = op_alloc(ORANGEFS_VFS_OP_READDIR); -	if (!op) { -		od->error = -ENOMEM; -		return -ENOMEM; -	} -  	/*  	 * Despite the badly named field, readdir does not use shared  	 * memory.  However, there are a limited number of readdir @@ -66,7 +79,6 @@ static int orangefs_dir_more(struct orangefs_inode_s *oi,  again:  	bufi = orangefs_readdir_index_get();  	if (bufi < 0) { -		op_release(op);  		od->error = bufi;  		return bufi;  	} @@ -84,7 +96,6 @@ again:  			goto again;  		} else if (r == -EIO) {  			vfree(op->downcall.trailer_buf); -			op_release(op);  			od->error = r;  			return r;  		} @@ -92,82 +103,166 @@ again:  	if (r < 0) {  		vfree(op->downcall.trailer_buf); -		op_release(op);  		od->error = r;  		return r;  	} else if (op->downcall.status) {  		vfree(op->downcall.trailer_buf); -		op_release(op);  		od->error = op->downcall.status;  		return op->downcall.status;  	} +	/* +	 * The maximum size is size per entry times the 512 entries plus +	 * the header.  This is well under the limit. +	 */ +	if (op->downcall.trailer_size > PART_SIZE) { +		vfree(op->downcall.trailer_buf); +		od->error = -EIO; +		return -EIO; +	} +  	resp = (struct orangefs_readdir_response_s *)  	    op->downcall.trailer_buf;  	od->token = resp->token; +	return 0; +} -	if (od->len + op->downcall.trailer_size - offset <= -	    MAX_DIRECTORY) { -		memcpy(od->directory + od->len, -		    op->downcall.trailer_buf + offset, -		    op->downcall.trailer_size - offset); -		od->len += op->downcall.trailer_size - offset; -	} else { -		/* This limit was chosen based on protocol limits. */ -		gossip_err("orangefs_dir_more: userspace sent too much data\n"); -		vfree(op->downcall.trailer_buf); -		op_release(op); -		od->error = -EIO; -		return -EIO; +static int parse_readdir(struct orangefs_dir *od, +    struct orangefs_kernel_op_s *op) +{ +	struct orangefs_dir_part *part, *new; +	size_t count; + +	count = 1; +	part = od->part; +	while (part && part->next) { +		part = part->next; +		count++;  	} -	vfree(op->downcall.trailer_buf); -	op_release(op); +	new = (void *)op->downcall.trailer_buf; +	new->next = NULL; +	new->len = op->downcall.trailer_size - +	    sizeof(struct orangefs_readdir_response_s); +	if (!od->part) +		od->part = new; +	else +		part->next = new; +	count++; +	od->end = count << PART_SHIFT; +  	return 0;  } -static int orangefs_dir_fill(struct orangefs_inode_s *oi, -    struct orangefs_dir *od, struct dentry *dentry, +static int orangefs_dir_more(struct orangefs_inode_s *oi, +    struct orangefs_dir *od, struct dentry *dentry) +{ +	struct orangefs_kernel_op_s *op; +	int r; + +	op = op_alloc(ORANGEFS_VFS_OP_READDIR); +	if (!op) { +		od->error = -ENOMEM; +		return -ENOMEM; +	} +	r = do_readdir(oi, od, dentry, op); +	if (r) { +		od->error = r; +		goto out; +	} +	r = parse_readdir(od, op); +	if (r) { +		od->error = r; +		goto out; +	} + +	od->error = 0; +out: +	op_release(op); +	return od->error; +} + +static int fill_from_part(struct orangefs_dir_part *part,      struct dir_context *ctx)  { +	const int offset = sizeof(struct orangefs_readdir_response_s);  	struct orangefs_khandle *khandle;  	__u32 *len, padlen;  	loff_t i;  	char *s; -	i = ctx->pos - 2; -	while (i < od->len) { -		if (od->len < i + sizeof *len) -			goto eio; -		len = od->directory + i; +	i = ctx->pos & ~PART_MASK; + +	/* The file offset from userspace is too large. */ +	if (i > part->len) +		return -EIO; + +	while (i < part->len) { +		if (part->len < i + sizeof *len) +			return -EIO; +		len = (void *)part + offset + i;  		/*  		 * len is the size of the string itself.  padlen is the  		 * total size of the encoded string.  		 */  		padlen = (sizeof *len + *len + 1) + -		    (4 - (sizeof *len + *len + 1)%8)%8; -		if (od->len < i + padlen + sizeof *khandle) -			goto eio; -		s = od->directory + i + sizeof *len; +		    (8 - (sizeof *len + *len + 1)%8)%8; +		if (part->len < i + padlen + sizeof *khandle) +			return -EIO; +		s = (void *)part + offset + i + sizeof *len;  		if (s[*len] != 0) -			goto eio; -		khandle = od->directory + i + padlen; - +			return -EIO; +		khandle = (void *)part + offset + i + padlen;  		if (!dir_emit(ctx, s, *len, -		    orangefs_khandle_to_ino(khandle), DT_UNKNOWN)) +		    orangefs_khandle_to_ino(khandle), +		    DT_UNKNOWN))  			return 0;  		i += padlen + sizeof *khandle;  		i = i + (8 - i%8)%8; -		ctx->pos = i + 2; +		BUG_ON(i > part->len); +		ctx->pos = (ctx->pos & PART_MASK) | i; +	} +	return 1; +} + +static int orangefs_dir_fill(struct orangefs_inode_s *oi, +    struct orangefs_dir *od, struct dentry *dentry, +    struct dir_context *ctx) +{ +	struct orangefs_dir_part *part; +	size_t count; + +	count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1; + +	part = od->part; +	while (part->next && count) { +		count--; +		part = part->next; +	} +	/* This means the userspace file offset is invalid. */ +	if (count) { +		od->error = -EIO; +		return -EIO; +	} + +	while (part && part->len) { +		int r; +		r = fill_from_part(part, ctx); +		if (r < 0) { +			od->error = r; +			return r; +		} else if (r == 0) { +			/* Userspace buffer is full. */ +			break; +		} else { +			/* +			 * The part ran out of data.  Move to the next +			 * part. */ +			ctx->pos = (ctx->pos & PART_MASK) + +			    (1 << PART_SHIFT); +			part = part->next; +		}  	} -	BUG_ON(i > od->len);  	return 0; -eio: -	/* -	 * Here either data from userspace is corrupt or the application -	 * has sought to an invalid location. -	 */ -	od->error = -EIO; -	return -EIO;  }  static int orangefs_dir_iterate(struct file *file, @@ -193,28 +288,33 @@ static int orangefs_dir_iterate(struct file *file,  	if (ctx->pos == 1) {  		if (!dir_emit_dotdot(file, ctx))  			return 0; -		ctx->pos++; +		ctx->pos = 1 << PART_SHIFT;  	} +	/* +	 * The seek position is in the first synthesized part but is not +	 * valid. +	 */ +	if ((ctx->pos & PART_MASK) == 0) +		return -EIO; +  	r = 0;  	/*  	 * Must read more if the user has sought past what has been read  	 * so far.  Stop a user who has sought past the end.  	 */ -	while (od->token != ORANGEFS_READDIR_END && ctx->pos - 2 > -	    od->len) { +	while (od->token != ORANGEFS_READDIR_END && +	    ctx->pos > od->end) {  		r = orangefs_dir_more(oi, od, dentry);  		if (r)  			return r;  	} -	if (od->token == ORANGEFS_READDIR_END && ctx->pos - 2 > -	    od->len) { +	if (od->token == ORANGEFS_READDIR_END && ctx->pos > od->end)  		return -EIO; -	}  	/* Then try to fill if there's any left in the buffer. */ -	if (ctx->pos - 2 < od->len) { +	if (ctx->pos < od->end) {  		r = orangefs_dir_fill(oi, od, dentry, ctx);  		if (r)  			return r; @@ -240,16 +340,8 @@ static int orangefs_dir_open(struct inode *inode, struct file *file)  		return -ENOMEM;  	od = file->private_data;  	od->token = ORANGEFS_READDIR_START; -	/* -	 * XXX: It seems wasteful to allocate such a large buffer for -	 * each request.  Most will be much smaller. -	 */ -	od->directory = alloc_pages_exact(MAX_DIRECTORY, GFP_KERNEL); -	if (!od->directory) { -		kfree(file->private_data); -		return -ENOMEM; -	} -	od->len = 0; +	od->part = NULL; +	od->end = 1 << PART_SHIFT;  	od->error = 0;  	return 0;  } @@ -257,8 +349,13 @@ static int orangefs_dir_open(struct inode *inode, struct file *file)  static int orangefs_dir_release(struct inode *inode, struct file *file)  {  	struct orangefs_dir *od = file->private_data; +	struct orangefs_dir_part *part = od->part;  	orangefs_flush_inode(inode); -	free_pages_exact(od->directory, MAX_DIRECTORY); +	while (part) { +		struct orangefs_dir_part *next = part->next; +		vfree(part); +		part = next; +	}  	kfree(od);  	return 0;  } | 
