From 2f1b6248682f8b39ca3c7e549dfc216d26c4109b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:13 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: use enum to define zones, reformat and
 comment

Use enum for zones and reformat zones dependent information

Add comments explaning the use of zones and add a zones_t type for zone
numbers.

Line up information that will be #ifdefd by the following patches.

[akpm@osdl.org: comment cleanups]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 66 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 47 insertions(+), 19 deletions(-)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f45163c528e8..03a5a6eb0ffa 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -88,14 +88,53 @@ struct per_cpu_pageset {
 #define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
 #endif
 
-#define ZONE_DMA		0
-#define ZONE_DMA32		1
-#define ZONE_NORMAL		2
-#define ZONE_HIGHMEM		3
+enum zone_type {
+	/*
+	 * ZONE_DMA is used when there are devices that are not able
+	 * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
+	 * carve out the portion of memory that is needed for these devices.
+	 * The range is arch specific.
+	 *
+	 * Some examples
+	 *
+	 * Architecture		Limit
+	 * ---------------------------
+	 * parisc, ia64, sparc	<4G
+	 * s390			<2G
+	 * arm26		<48M
+	 * arm			Various
+	 * alpha		Unlimited or 0-16MB.
+	 *
+	 * i386, x86_64 and multiple other arches
+	 * 			<16M.
+	 */
+	ZONE_DMA,
+	/*
+	 * x86_64 needs two ZONE_DMAs because it supports devices that are
+	 * only able to do DMA to the lower 16M but also 32 bit devices that
+	 * can only do DMA areas below 4G.
+	 */
+	ZONE_DMA32,
+	/*
+	 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
+	 * performed on pages in ZONE_NORMAL if the DMA devices support
+	 * transfers to all addressable memory.
+	 */
+	ZONE_NORMAL,
+	/*
+	 * A memory area that is only addressable by the kernel through
+	 * mapping portions into its own address space. This is for example
+	 * used by i386 to allow the kernel to address the memory beyond
+	 * 900MB. The kernel will set up special mappings (page
+	 * table entries on i386) for each page that the kernel needs to
+	 * access.
+	 */
+	ZONE_HIGHMEM,
 
-#define MAX_NR_ZONES		4	/* Sync this with ZONES_SHIFT */
-#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
+	MAX_NR_ZONES
+};
 
+#define	ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
 
 /*
  * When a memory allocation must conform to specific limitations (such
@@ -126,16 +165,6 @@ struct per_cpu_pageset {
 /* #define GFP_ZONETYPES       (GFP_ZONEMASK + 1) */           /* Non-loner */
 #define GFP_ZONETYPES  ((GFP_ZONEMASK + 1) / 2 + 1)            /* Loner */
 
-/*
- * On machines where it is needed (eg PCs) we divide physical memory
- * into multiple physical zones. On a 32bit PC we have 4 zones:
- *
- * ZONE_DMA	  < 16 MB	ISA DMA capable memory
- * ZONE_DMA32	     0 MB 	Empty
- * ZONE_NORMAL	16-896 MB	direct mapped by the kernel
- * ZONE_HIGHMEM	 > 896 MB	only page cache and user processes
- */
-
 struct zone {
 	/* Fields commonly accessed by the page allocator */
 	unsigned long		free_pages;
@@ -266,7 +295,6 @@ struct zone {
 	char			*name;
 } ____cacheline_internodealigned_in_smp;
 
-
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -373,12 +401,12 @@ static inline int populated_zone(struct zone *zone)
 	return (!!zone->present_pages);
 }
 
-static inline int is_highmem_idx(int idx)
+static inline int is_highmem_idx(enum zone_type idx)
 {
 	return (idx == ZONE_HIGHMEM);
 }
 
-static inline int is_normal_idx(int idx)
+static inline int is_normal_idx(enum zone_type idx)
 {
 	return (idx == ZONE_NORMAL);
 }
-- 
cgit 


From fb0e7942bdcbbd2f90e61cb4cfa4fa892a873f8a Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:13 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: make ZONE_DMA32 optional

Make ZONE_DMA32 optional

- Add #ifdefs around ZONE_DMA32 specific code and definitions.

- Add CONFIG_ZONE_DMA32 config option and use that for x86_64
  that alone needs this zone.

- Remove the use of CONFIG_DMA_IS_DMA32 and CONFIG_DMA_IS_NORMAL
  for ia64 and fix up the way per node ZVCs are calculated.

- Fall back to prior GFP_ZONEMASK of 0x03 if there is no
  DMA32 zone.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 03a5a6eb0ffa..adae3c915938 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -109,12 +109,14 @@ enum zone_type {
 	 * 			<16M.
 	 */
 	ZONE_DMA,
+#ifdef CONFIG_ZONE_DMA32
 	/*
 	 * x86_64 needs two ZONE_DMAs because it supports devices that are
 	 * only able to do DMA to the lower 16M but also 32 bit devices that
 	 * can only do DMA areas below 4G.
 	 */
 	ZONE_DMA32,
+#endif
 	/*
 	 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
 	 * performed on pages in ZONE_NORMAL if the DMA devices support
@@ -161,9 +163,13 @@ enum zone_type {
  *
  * NOTE! Make sure this matches the zones in <linux/gfp.h>
  */
-#define GFP_ZONEMASK	0x07
-/* #define GFP_ZONETYPES       (GFP_ZONEMASK + 1) */           /* Non-loner */
-#define GFP_ZONETYPES  ((GFP_ZONEMASK + 1) / 2 + 1)            /* Loner */
+#define GFP_ZONETYPES		((GFP_ZONEMASK + 1) / 2 + 1)    /* Loner */
+
+#ifdef CONFIG_ZONE_DMA32
+#define GFP_ZONEMASK		0x07
+#else
+#define GFP_ZONEMASK		0x03
+#endif
 
 struct zone {
 	/* Fields commonly accessed by the page allocator */
@@ -429,7 +435,11 @@ static inline int is_normal(struct zone *zone)
 
 static inline int is_dma32(struct zone *zone)
 {
+#ifdef CONFIG_ZONE_DMA32
 	return zone == zone->zone_pgdat->node_zones + ZONE_DMA32;
+#else
+	return 0;
+#endif
 }
 
 static inline int is_dma(struct zone *zone)
-- 
cgit 


From e53ef38d05dd59ed281a35590e4a5b64d8ff4c52 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:14 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: make ZONE_HIGHMEM optional

Make ZONE_HIGHMEM optional

- ifdef out code and definitions related to CONFIG_HIGHMEM

- __GFP_HIGHMEM falls back to normal allocations if there is no
  ZONE_HIGHMEM

- GFP_ZONEMASK becomes 0x01 if there is no DMA32 and no HIGHMEM
  zone.

[jdike@addtoit.com: build fix]
Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index adae3c915938..76d33e688593 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -123,6 +123,7 @@ enum zone_type {
 	 * transfers to all addressable memory.
 	 */
 	ZONE_NORMAL,
+#ifdef CONFIG_HIGHMEM
 	/*
 	 * A memory area that is only addressable by the kernel through
 	 * mapping portions into its own address space. This is for example
@@ -132,11 +133,10 @@ enum zone_type {
 	 * access.
 	 */
 	ZONE_HIGHMEM,
-
+#endif
 	MAX_NR_ZONES
 };
 
-#define	ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
 
 /*
  * When a memory allocation must conform to specific limitations (such
@@ -163,12 +163,34 @@ enum zone_type {
  *
  * NOTE! Make sure this matches the zones in <linux/gfp.h>
  */
-#define GFP_ZONETYPES		((GFP_ZONEMASK + 1) / 2 + 1)    /* Loner */
 
 #ifdef CONFIG_ZONE_DMA32
+
+#ifdef CONFIG_HIGHMEM
+#define GFP_ZONETYPES		((GFP_ZONEMASK + 1) / 2 + 1)    /* Loner */
 #define GFP_ZONEMASK		0x07
+#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
 #else
+#define GFP_ZONETYPES		((0x07 + 1) / 2 + 1)    /* Loner */
+/* Mask __GFP_HIGHMEM */
+#define GFP_ZONEMASK		0x05
+#define ZONES_SHIFT		2
+#endif
+
+#else
+#ifdef CONFIG_HIGHMEM
+
 #define GFP_ZONEMASK		0x03
+#define ZONES_SHIFT		2
+#define GFP_ZONETYPES		3
+
+#else
+
+#define GFP_ZONEMASK		0x01
+#define ZONES_SHIFT		1
+#define GFP_ZONETYPES		2
+
+#endif
 #endif
 
 struct zone {
@@ -409,7 +431,11 @@ static inline int populated_zone(struct zone *zone)
 
 static inline int is_highmem_idx(enum zone_type idx)
 {
+#ifdef CONFIG_HIGHMEM
 	return (idx == ZONE_HIGHMEM);
+#else
+	return 0;
+#endif
 }
 
 static inline int is_normal_idx(enum zone_type idx)
@@ -425,7 +451,11 @@ static inline int is_normal_idx(enum zone_type idx)
  */
 static inline int is_highmem(struct zone *zone)
 {
+#ifdef CONFIG_HIGHMEM
 	return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
+#else
+	return 0;
+#endif
 }
 
 static inline int is_normal(struct zone *zone)
-- 
cgit 


From 19655d3487001d7df0e10e9cbfc27c758b77c2b5 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:19 -0700
Subject: [PATCH] linearly index zone->node_zonelists[]

I wonder why we need this bitmask indexing into zone->node_zonelists[]?

We always start with the highest zone and then include all lower zones
if we build zonelists.

Are there really cases where we need allocation from ZONE_DMA or
ZONE_HIGHMEM but not ZONE_NORMAL? It seems that the current implementation
of highest_zone() makes that already impossible.

If we go linear on the index then gfp_zone() == highest_zone() and a lot
of definitions fall by the wayside.

We can now revert back to the use of gfp_zone() in mempolicy.c ;-)

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 52 +++++---------------------------------------------
 1 file changed, 5 insertions(+), 47 deletions(-)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 76d33e688593..7fe317164b73 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -137,60 +137,18 @@ enum zone_type {
 	MAX_NR_ZONES
 };
 
-
 /*
  * When a memory allocation must conform to specific limitations (such
  * as being suitable for DMA) the caller will pass in hints to the
  * allocator in the gfp_mask, in the zone modifier bits.  These bits
  * are used to select a priority ordered list of memory zones which
- * match the requested limits.  GFP_ZONEMASK defines which bits within
- * the gfp_mask should be considered as zone modifiers.  Each valid
- * combination of the zone modifier bits has a corresponding list
- * of zones (in node_zonelists).  Thus for two zone modifiers there
- * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will
- * be 8 (2 ** 3) zonelists.  GFP_ZONETYPES defines the number of possible
- * combinations of zone modifiers in "zone modifier space".
- *
- * As an optimisation any zone modifier bits which are only valid when
- * no other zone modifier bits are set (loners) should be placed in
- * the highest order bits of this field.  This allows us to reduce the
- * extent of the zonelists thus saving space.  For example in the case
- * of three zone modifier bits, we could require up to eight zonelists.
- * If the left most zone modifier is a "loner" then the highest valid
- * zonelist would be four allowing us to allocate only five zonelists.
- * Use the first form for GFP_ZONETYPES when the left most bit is not
- * a "loner", otherwise use the second.
- *
- * NOTE! Make sure this matches the zones in <linux/gfp.h>
+ * match the requested limits. See gfp_zone() in include/linux/gfp.h
  */
 
-#ifdef CONFIG_ZONE_DMA32
-
-#ifdef CONFIG_HIGHMEM
-#define GFP_ZONETYPES		((GFP_ZONEMASK + 1) / 2 + 1)    /* Loner */
-#define GFP_ZONEMASK		0x07
-#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
-#else
-#define GFP_ZONETYPES		((0x07 + 1) / 2 + 1)    /* Loner */
-/* Mask __GFP_HIGHMEM */
-#define GFP_ZONEMASK		0x05
-#define ZONES_SHIFT		2
-#endif
-
-#else
-#ifdef CONFIG_HIGHMEM
-
-#define GFP_ZONEMASK		0x03
-#define ZONES_SHIFT		2
-#define GFP_ZONETYPES		3
-
+#if !defined(CONFIG_ZONE_DMA32) && !defined(CONFIG_HIGHMEM)
+#define ZONES_SHIFT 1
 #else
-
-#define GFP_ZONEMASK		0x01
-#define ZONES_SHIFT		1
-#define GFP_ZONETYPES		2
-
-#endif
+#define ZONES_SHIFT 2
 #endif
 
 struct zone {
@@ -360,7 +318,7 @@ struct zonelist {
 struct bootmem_data;
 typedef struct pglist_data {
 	struct zone node_zones[MAX_NR_ZONES];
-	struct zonelist node_zonelists[GFP_ZONETYPES];
+	struct zonelist node_zonelists[MAX_NR_ZONES];
 	int nr_zones;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP
 	struct page *node_mem_map;
-- 
cgit 


From 8417bba4b151346ed475fcc923693c9e3be89063 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:51 -0700
Subject: [PATCH] Replace min_unmapped_ratio by min_unmapped_pages in struct
 zone

*_pages is a better description of the role of the variable.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7fe317164b73..a703527e2b45 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -169,7 +169,7 @@ struct zone {
 	/*
 	 * zone reclaim becomes active if more unmapped pages exist.
 	 */
-	unsigned long		min_unmapped_ratio;
+	unsigned long		min_unmapped_pages;
 	struct per_cpu_pageset	*pageset[NR_CPUS];
 #else
 	struct per_cpu_pageset	pageset[NR_CPUS];
-- 
cgit 


From 972d1a7b140569084439a81265a0f15b74e924e0 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:51 -0700
Subject: [PATCH] ZVC: Support NR_SLAB_RECLAIMABLE / NR_SLAB_UNRECLAIMABLE

Remove the atomic counter for slab_reclaim_pages and replace the counter
and NR_SLAB with two ZVC counter that account for unreclaimable and
reclaimable slab pages: NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE.

Change the check in vmscan.c to refer to to NR_SLAB_RECLAIMABLE.  The
intend seems to be to check for slab pages that could be freed.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a703527e2b45..08c41b9f92e0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -51,7 +51,8 @@ enum zone_stat_item {
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
 			   only modified from process context */
 	NR_FILE_PAGES,
-	NR_SLAB,	/* Pages used by slab allocator */
+	NR_SLAB_RECLAIMABLE,
+	NR_SLAB_UNRECLAIMABLE,
 	NR_PAGETABLE,	/* used for pagetables */
 	NR_FILE_DIRTY,
 	NR_WRITEBACK,
-- 
cgit 


From 0ff38490c836dc379ff7ec45b10a15a662f4e5f6 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:52 -0700
Subject: [PATCH] zone_reclaim: dynamic slab reclaim

Currently one can enable slab reclaim by setting an explicit option in
/proc/sys/vm/zone_reclaim_mode.  Slab reclaim is then used as a final
option if the freeing of unmapped file backed pages is not enough to free
enough pages to allow a local allocation.

However, that means that the slab can grow excessively and that most memory
of a node may be used by slabs.  We have had a case where a machine with
46GB of memory was using 40-42GB for slab.  Zone reclaim was effective in
dealing with pagecache pages.  However, slab reclaim was only done during
global reclaim (which is a bit rare on NUMA systems).

This patch implements slab reclaim during zone reclaim.  Zone reclaim
occurs if there is a danger of an off node allocation.  At that point we

1. Shrink the per node page cache if the number of pagecache
   pages is more than min_unmapped_ratio percent of pages in a zone.

2. Shrink the slab cache if the number of the nodes reclaimable slab pages
   (patch depends on earlier one that implements that counter)
   are more than min_slab_ratio (a new /proc/sys/vm tunable).

The shrinking of the slab cache is a bit problematic since it is not node
specific.  So we simply calculate what point in the slab we want to reach
(current per node slab use minus the number of pages that neeed to be
allocated) and then repeately run the global reclaim until that is
unsuccessful or we have reached the limit.  I hope we will have zone based
slab reclaim at some point which will make that easier.

The default for the min_slab_ratio is 5%

Also remove the slab option from /proc/sys/vm/zone_reclaim_mode.

[akpm@osdl.org: cleanups]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 08c41b9f92e0..3693f1a52788 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -171,6 +171,7 @@ struct zone {
 	 * zone reclaim becomes active if more unmapped pages exist.
 	 */
 	unsigned long		min_unmapped_pages;
+	unsigned long		min_slab_pages;
 	struct per_cpu_pageset	*pageset[NR_CPUS];
 #else
 	struct per_cpu_pageset	pageset[NR_CPUS];
@@ -448,6 +449,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file
 					void __user *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
+int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
+			struct file *, void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
-- 
cgit