| author | Wu Fengguang <fengguang.wu@intel.com> | 2011-11-30 11:08:55 -0600 | 
|---|---|---|
| committer | Wu Fengguang <fengguang.wu@intel.com> | 2011-12-18 14:20:28 +0800 | 
| commit | 7ccb9ad5364d6ac0c803096c67e76a7545cf7a77 (patch) | |
| tree | 53894333454bca278f20f9c5841dd1b45c384721 /mm/page-writeback.c | |
| parent | 83712358ba0a1497ce59a4f84ce4dd0f803fe6fc (diff) | |
writeback: max, min and target dirty pause time
Control the pause time and the intervals between calls to
balance_dirty_pages() with three parameters (see the sketch below):
1) max_pause, limited by bdi_dirty and MAX_PAUSE
2) the target pause time, which grows with the number of dd tasks
   and is normally capped at max_pause/2
3) the minimal pause, set to half the target pause; it is used to skip
   short sleeps and accumulate them into bigger ones
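Here is a minimal userspace sketch of how the three parameters relate, as
described above. The constants (HZ, the MAX_PAUSE cap) and all input values
are illustrative stand-ins, not the kernel's definitions; the authoritative
code is bdi_max_pause()/bdi_min_pause() in the diff below.

```c
#include <stdio.h>

#define HZ        1000          /* assumed tick rate, so jiffies == ms */
#define MAX_PAUSE (HZ / 5)      /* assumed 200ms hard cap */

/* max_pause: don't sleep so long that the bdi_dirty pool drains dry */
static long max_pause(long bdi_dirty, long write_bw)
{
	long t = bdi_dirty / (1 + write_bw / (1 + HZ / 8)) + 1;
	return t < MAX_PAUSE ? t : MAX_PAUSE;
}

/*
 * target pause: 10ms for one dd, plus ~N*10ms on 2^N dirtiers,
 * normally capped at half of max_pause
 */
static long target_pause(int log2_dirtiers, long max_p)
{
	long t = HZ / 100 + log2_dirtiers * (10 * HZ) / 1024;
	long cap = 1 + max_p / 2;
	return t < cap ? t : cap;
}

int main(void)
{
	long maxp = max_pause(16384, 25600); /* pages, pages/s: made up */
	long t = target_pause(2, maxp);      /* ~4 concurrent dirtiers */
	long minp = 1 + t / 2;               /* minimal pause = half target */

	printf("max=%ld target=%ld min=%ld (jiffies)\n", maxp, t, minp);
	return 0;
}
```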
The typical behaviors after the patch:
- if task_ratelimit ever falls far below dirty_ratelimit, the pause time
  will remain constant at max_pause and nr_dirtied_pause will fluctuate
  with task_ratelimit
- in the normal cases, nr_dirtied_pause will remain stable (keeping pace
  with dirty_ratelimit) and the pause time will fluctuate with
  task_ratelimit
In summary, one of the two has to fluctuate with task_ratelimit, because
	task_ratelimit = nr_dirtied_pause / pause
We normally prefer a stable nr_dirtied_pause, until pause reaches max_pause.
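A tiny worked example of that identity, with made-up numbers; the kernel
derives the period the same way (HZ * pages_dirtied / task_ratelimit, with
a +1 guarding against division by zero):

```c
#include <stdio.h>

#define HZ 1000 /* assumed tick rate */

int main(void)
{
	unsigned long task_ratelimit = 2048; /* pages/s, illustrative */
	int nr_dirtied_pause = 512;          /* pages dirtied per period */

	/* pause = pages / rate: 512 / 2048 ~= 0.25s, ~250 jiffies */
	long pause = HZ * nr_dirtied_pause / (task_ratelimit + 1);
	printf("pause = %ld jiffies\n", pause);
	return 0;
}
```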
The notable behavior changes are:
- in stable workloads, there will no longer be the sudden big trajectory
  switching of nr_dirtied_pause that Peter was concerned about. It will
  be as smooth as dirty_ratelimit and change proportionally with it (as
  always, assuming bdi bandwidth does not fluctuate across 2^N lines,
  otherwise nr_dirtied_pause will show up in 2+ parallel trajectories)
- in the rare cases when something keeps task_ratelimit far below
  dirty_ratelimit, the smoothness can no longer be retained and
  nr_dirtied_pause will be "dancing" with task_ratelimit. This fixes a
  (not that destructive, but still undesirable) bug where
	  dirty_ratelimit gets brought down undesirably
	  <= balanced_dirty_ratelimit is underestimated
	  <= weakly executed task_ratelimit
	  <= pause grows too large and gets trimmed down to max_pause
	  <= nr_dirtied_pause (based on dirty_ratelimit) is set too large
	  <= dirty_ratelimit being much larger than task_ratelimit
- introduce min_pause to avoid overly small pause sleeps
- when pause is trimmed down to max_pause, try to compensate for it at
  the next pause time (see the sketch after this list)
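A minimal sketch of that trim-and-compensate rule, with made-up numbers; it
mirrors the `pause > max_pause` branch added to balance_dirty_pages() in the
diff below:

```c
#include <stdio.h>

int main(void)
{
	long max_pause = 200; /* jiffies, illustrative */
	long pause = 350;     /* computed from nr_dirtied_pause/task_ratelimit */
	long now = 10000;     /* current time in jiffies, illustrative */

	if (pause > max_pause) {
		/*
		 * Sleep only max_pause now; push the period start forward
		 * so the untaken sleep is repaid by the next interval,
		 * crediting at most one extra max_pause.
		 */
		long excess = pause - max_pause;
		now += excess < max_pause ? excess : max_pause;
		pause = max_pause;
	}
	/* the kernel then sets dirty_paused_when = now + pause */
	printf("sleep %ld, next period starts at %ld\n", pause, now + pause);
	return 0;
}
```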
The "refactor" type of changes are:
The max_pause equation is slightly transformed to make it slightly more
efficient.
We now scale target_pause by (N * 10ms) on 2^N concurrent tasks, which
is effectively equal to the original scaling max_pause by (N * 20ms)
because the original code does implicit target_pause ~= max_pause / 2.
Based on the same implicit ratio, target_pause starts with 10ms on 1 dd.
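A quick check of that equivalence, assuming HZ=1000 for illustration (the
1024 divisor is the one the kernel code uses in place of 1000):

```c
#include <stdio.h>

#define HZ 1000 /* assumed tick rate */

int main(void)
{
	for (int n = 0; n <= 4; n++) { /* 2^n concurrent dd tasks */
		/* old: max_pause = 20ms + N*20ms, target implicitly half */
		long old_target = (HZ / 50 + n * (20 * HZ) / 1024) / 2;
		/* new: target_pause = 10ms + N*10ms, scaled directly */
		long new_target = HZ / 100 + n * (10 * HZ) / 1024;

		printf("N=%d old~=%ldms new=%ldms\n", n, old_target, new_target);
	}
	return 0;
}
```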
CC: Jan Kara <jack@suse.cz>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Diffstat (limited to 'mm/page-writeback.c')
| -rw-r--r-- | mm/page-writeback.c | 125 | 
1 file changed, 81 insertions, 44 deletions
```diff
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 491932155825..5830991f261a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -962,40 +962,81 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
 	return 1;
 }
 
-static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
-				   unsigned long bdi_dirty)
+static long bdi_max_pause(struct backing_dev_info *bdi,
+			  unsigned long bdi_dirty)
 {
-	unsigned long bw = bdi->avg_write_bandwidth;
-	unsigned long hi = ilog2(bw);
-	unsigned long lo = ilog2(bdi->dirty_ratelimit);
-	unsigned long t;
+	long bw = bdi->avg_write_bandwidth;
+	long t;
 
-	/* target for 20ms max pause on 1-dd case */
-	t = HZ / 50;
+	/*
+	 * Limit pause time for small memory systems. If sleeping for too long
+	 * time, a small pool of dirty/writeback pages may go empty and disk go
+	 * idle.
+	 *
+	 * 8 serves as the safety ratio.
+	 */
+	t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+	t++;
+
+	return min_t(long, t, MAX_PAUSE);
+}
+
+static long bdi_min_pause(struct backing_dev_info *bdi,
+			  long max_pause,
+			  unsigned long task_ratelimit,
+			  unsigned long dirty_ratelimit,
+			  int *nr_dirtied_pause)
+{
+	long hi = ilog2(bdi->avg_write_bandwidth);
+	long lo = ilog2(bdi->dirty_ratelimit);
+	long t;		/* target pause */
+	long pause;	/* estimated next pause */
+	int pages;	/* target nr_dirtied_pause */
+
+	/* target for 10ms pause on 1-dd case */
+	t = max(1, HZ / 100);
 
 	/*
 	 * Scale up pause time for concurrent dirtiers in order to reduce CPU
 	 * overheads.
 	 *
-	 * (N * 20ms) on 2^N concurrent tasks.
+	 * (N * 10ms) on 2^N concurrent tasks.
 	 */
 	if (hi > lo)
-		t += (hi - lo) * (20 * HZ) / 1024;
+		t += (hi - lo) * (10 * HZ) / 1024;
 
 	/*
-	 * Limit pause time for small memory systems. If sleeping for too long
-	 * time, a small pool of dirty/writeback pages may go empty and disk go
-	 * idle.
+	 * This is a bit convoluted. We try to base the next nr_dirtied_pause
+	 * on the much more stable dirty_ratelimit. However the next pause time
+	 * will be computed based on task_ratelimit and the two rate limits may
+	 * depart considerably at some time. Especially if task_ratelimit goes
+	 * below dirty_ratelimit/2 and the target pause is max_pause, the next
+	 * pause time will be max_pause*2 _trimmed down_ to max_pause.  As a
+	 * result task_ratelimit won't be executed faithfully, which could
+	 * eventually bring down dirty_ratelimit.
 	 *
-	 * 8 serves as the safety ratio.
+	 * We apply two rules to fix it up:
+	 * 1) try to estimate the next pause time and if necessary, use a lower
+	 *    nr_dirtied_pause so as not to exceed max_pause. When this happens,
+	 *    nr_dirtied_pause will be "dancing" with task_ratelimit.
+	 * 2) limit the target pause time to max_pause/2, so that the normal
+	 *    small fluctuations of task_ratelimit won't trigger rule (1) and
+	 *    nr_dirtied_pause will remain as stable as dirty_ratelimit.
 	 */
-	t = min(t, bdi_dirty * HZ / (8 * bw + 1));
+	t = min(t, 1 + max_pause / 2);
+	pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
+
+	pause = HZ * pages / (task_ratelimit + 1);
+	if (pause > max_pause) {
+		t = max_pause;
+		pages = task_ratelimit * t / roundup_pow_of_two(HZ);
+	}
+	*nr_dirtied_pause = pages;
 
 	/*
-	 * The pause time will be settled within range (max_pause/4, max_pause).
-	 * Apply a minimal value of 4 to get a non-zero max_pause/4.
+	 * The minimal pause time will normally be half the target pause time.
 	 */
-	return clamp_val(t, 4, MAX_PAUSE);
+	return 1 + t / 2;
 }
 
 /*
@@ -1017,11 +1058,13 @@ static void balance_dirty_pages(struct address_space *mapping,
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
 	long period;
-	long pause = 0;
-	long uninitialized_var(max_pause);
+	long pause;
+	long max_pause;
+	long min_pause;
+	int nr_dirtied_pause;
 	bool dirty_exceeded = false;
 	unsigned long task_ratelimit;
-	unsigned long uninitialized_var(dirty_ratelimit);
+	unsigned long dirty_ratelimit;
 	unsigned long pos_ratio;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	unsigned long start_time = jiffies;
@@ -1051,6 +1094,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (nr_dirty <= freerun) {
 			current->dirty_paused_when = now;
 			current->nr_dirtied = 0;
+			current->nr_dirtied_pause =
+				dirty_poll_interval(nr_dirty, dirty_thresh);
 			break;
 		}
 
@@ -1101,14 +1146,17 @@ static void balance_dirty_pages(struct address_space *mapping,
 				     nr_dirty, bdi_thresh, bdi_dirty,
 				     start_time);
 
-		max_pause = bdi_max_pause(bdi, bdi_dirty);
-
 		dirty_ratelimit = bdi->dirty_ratelimit;
 		pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
 					       background_thresh, nr_dirty,
 					       bdi_thresh, bdi_dirty);
 		task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
 							RATELIMIT_CALC_SHIFT;
+		max_pause = bdi_max_pause(bdi, bdi_dirty);
+		min_pause = bdi_min_pause(bdi, max_pause,
+					  task_ratelimit, dirty_ratelimit,
+					  &nr_dirtied_pause);
+
 		if (unlikely(task_ratelimit == 0)) {
 			period = max_pause;
 			pause = max_pause;
@@ -1125,7 +1173,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * future periods by updating the virtual time; otherwise just
 		 * do a reset, as it may be a light dirtier.
 		 */
-		if (unlikely(pause <= 0)) {
+		if (pause < min_pause) {
 			trace_balance_dirty_pages(bdi,
 						  dirty_thresh,
 						  background_thresh,
@@ -1136,7 +1184,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 						  task_ratelimit,
 						  pages_dirtied,
 						  period,
-						  pause,
+						  min(pause, 0L),
 						  start_time);
 			if (pause < -HZ) {
 				current->dirty_paused_when = now;
@@ -1144,11 +1192,15 @@ static void balance_dirty_pages(struct address_space *mapping,
 			} else if (period) {
 				current->dirty_paused_when += period;
 				current->nr_dirtied = 0;
-			}
-			pause = 1; /* avoid resetting nr_dirtied_pause below */
+			} else if (current->nr_dirtied_pause <= pages_dirtied)
+				current->nr_dirtied_pause += pages_dirtied;
 			break;
 		}
-		pause = min(pause, max_pause);
+		if (unlikely(pause > max_pause)) {
+			/* for occasional dropped task_ratelimit */
+			now += min(pause - max_pause, max_pause);
+			pause = max_pause;
+		}
 
 pause:
 		trace_balance_dirty_pages(bdi,
@@ -1168,6 +1220,7 @@ pause:
 
 		current->dirty_paused_when = now + pause;
 		current->nr_dirtied = 0;
+		current->nr_dirtied_pause = nr_dirtied_pause;
 
 		/*
 		 * This is typically equal to (nr_dirty < dirty_thresh) and can
@@ -1196,22 +1249,6 @@ pause:
 	if (!dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;
 
-	if (pause == 0) { /* in freerun area */
-		current->nr_dirtied_pause =
-				dirty_poll_interval(nr_dirty, dirty_thresh);
-	} else if (period <= max_pause / 4 &&
-		   pages_dirtied >= current->nr_dirtied_pause) {
-		current->nr_dirtied_pause = clamp_val(
-					dirty_ratelimit * (max_pause / 2) / HZ,
-					pages_dirtied + pages_dirtied / 8,
-					pages_dirtied * 4);
-	} else if (pause >= max_pause) {
-		current->nr_dirtied_pause = 1 | clamp_val(
-					dirty_ratelimit * (max_pause / 2) / HZ,
-					pages_dirtied / 4,
-					pages_dirtied - pages_dirtied / 8);
-	}
-
 	if (writeback_in_progress(bdi))
 		return;
 
```