From d32311fed70d12f14e585feb4653571b1e2b0e6d Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 17 Sep 2005 14:41:40 +1000
Subject: [PATCH 01/91] [PATCH] Introduce sg_set_buf

sg_init_one is a nice tool for the block layer.  However, users
of struct scatterlist in other subsystems don't usually need the
DMA attributes.  For them it's a waste of time and space to
initialise the whole struct scatterlist structure.

Therefore this patch adds a new function sg_set_buf to initialise
a scatterlist without zeroing the DMA attributes.
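
A minimal usage sketch of the two helpers defined below (the buffer name is
hypothetical):

	struct scatterlist sg;
	u8 buf[64];

	/* fast path: fill in only page/offset/length */
	sg_set_buf(&sg, buf, sizeof(buf));

	/* block-layer style: zero the whole struct first, then fill it in */
	sg_init_one(&sg, buf, sizeof(buf));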

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/scatterlist.h | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 7f717e95ae37..66ff545552f7 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -1,14 +1,23 @@
 #ifndef _LINUX_SCATTERLIST_H
 #define _LINUX_SCATTERLIST_H
 
-static inline void sg_init_one(struct scatterlist *sg,
-			       u8 *buf, unsigned int buflen)
-{
-	memset(sg, 0, sizeof(*sg));
+#include <asm/scatterlist.h>
+#include <linux/mm.h>
+#include <linux/string.h>
 
+static inline void sg_set_buf(struct scatterlist *sg, void *buf,
+			      unsigned int buflen)
+{
 	sg->page = virt_to_page(buf);
 	sg->offset = offset_in_page(buf);
 	sg->length = buflen;
 }
 
+static inline void sg_init_one(struct scatterlist *sg, void *buf,
+			       unsigned int buflen)
+{
+	memset(sg, 0, sizeof(*sg));
+	sg_set_buf(sg, buf, buflen);
+}
+
 #endif /* _LINUX_SCATTERLIST_H */

From 378f058cc49bcda7fa63d3cd86d2f9a0a5188b1c Mon Sep 17 00:00:00 2001
From: David Hardeman <david@2gen.com>
Date: Sat, 17 Sep 2005 17:55:31 +1000
Subject: [PATCH 02/91] [PATCH] Use sg_set_buf/sg_init_one where applicable

This patch uses sg_set_buf/sg_init_one in places where the scatterlist
setup was previously open-coded.

Signed-off-by: David Hardeman <david@2gen.com>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: Greg KH <greg@kroah.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jeff Garzik <jgarzik@pobox.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/hmac.c                         | 19 +++-------
 crypto/tcrypt.c                       | 52 ++++++++-------------------
 drivers/md/dm-crypt.c                 | 12 +++----
 drivers/net/wireless/airo.c           |  5 ++-
 drivers/scsi/arm/scsi.h               |  6 ++--
 drivers/scsi/libata-core.c            | 10 ++----
 drivers/scsi/sg.c                     |  5 ++-
 drivers/usb/misc/usbtest.c            |  7 ++--
 net/ipv6/addrconf.c                   | 10 ++----
 net/sunrpc/auth_gss/gss_krb5_crypto.c | 23 ++++--------
 10 files changed, 44 insertions(+), 105 deletions(-)

diff --git a/crypto/hmac.c b/crypto/hmac.c
index da0456b37109..46120dee5ada 100644
--- a/crypto/hmac.c
+++ b/crypto/hmac.c
@@ -18,18 +18,15 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include "internal.h"
 
 static void hash_key(struct crypto_tfm *tfm, u8 *key, unsigned int keylen)
 {
 	struct scatterlist tmp;
 	
-	tmp.page = virt_to_page(key);
-	tmp.offset = offset_in_page(key);
-	tmp.length = keylen;
+	sg_set_buf(&tmp, key, keylen);
 	crypto_digest_digest(tfm, &tmp, 1, key);
-		
 }
 
 int crypto_alloc_hmac_block(struct crypto_tfm *tfm)
@@ -69,9 +66,7 @@ void crypto_hmac_init(struct crypto_tfm *tfm, u8 *key, unsigned int *keylen)
 	for (i = 0; i < crypto_tfm_alg_blocksize(tfm); i++)
 		ipad[i] ^= 0x36;
 
-	tmp.page = virt_to_page(ipad);
-	tmp.offset = offset_in_page(ipad);
-	tmp.length = crypto_tfm_alg_blocksize(tfm);
+	sg_set_buf(&tmp, ipad, crypto_tfm_alg_blocksize(tfm));
 	
 	crypto_digest_init(tfm);
 	crypto_digest_update(tfm, &tmp, 1);
@@ -103,16 +98,12 @@ void crypto_hmac_final(struct crypto_tfm *tfm, u8 *key,
 	for (i = 0; i < crypto_tfm_alg_blocksize(tfm); i++)
 		opad[i] ^= 0x5c;
 
-	tmp.page = virt_to_page(opad);
-	tmp.offset = offset_in_page(opad);
-	tmp.length = crypto_tfm_alg_blocksize(tfm);
+	sg_set_buf(&tmp, opad, crypto_tfm_alg_blocksize(tfm));
 
 	crypto_digest_init(tfm);
 	crypto_digest_update(tfm, &tmp, 1);
 	
-	tmp.page = virt_to_page(out);
-	tmp.offset = offset_in_page(out);
-	tmp.length = crypto_tfm_alg_digestsize(tfm);
+	sg_set_buf(&tmp, out, crypto_tfm_alg_digestsize(tfm));
 	
 	crypto_digest_update(tfm, &tmp, 1);
 	crypto_digest_final(tfm, out);
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 68639419c5bd..577a3aff3113 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -21,7 +21,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/string.h>
 #include <linux/crypto.h>
 #include <linux/highmem.h>
@@ -86,7 +86,6 @@ static void hexdump(unsigned char *buf, unsigned int len)
 static void test_hash(char *algo, struct hash_testvec *template,
 		      unsigned int tcount)
 {
-	char *p;
 	unsigned int i, j, k, temp;
 	struct scatterlist sg[8];
 	char result[64];
@@ -116,10 +115,7 @@ static void test_hash(char *algo, struct hash_testvec *template,
 		printk("test %u:\n", i + 1);
 		memset(result, 0, 64);
 
-		p = hash_tv[i].plaintext;
-		sg[0].page = virt_to_page(p);
-		sg[0].offset = offset_in_page(p);
-		sg[0].length = hash_tv[i].psize;
+		sg_set_buf(&sg[0], hash_tv[i].plaintext, hash_tv[i].psize);
 
 		crypto_digest_init(tfm);
 		if (tfm->crt_u.digest.dit_setkey) {
@@ -154,10 +150,8 @@ static void test_hash(char *algo, struct hash_testvec *template,
 				       hash_tv[i].plaintext + temp,
 				       hash_tv[i].tap[k]);
 				temp += hash_tv[i].tap[k];
-				p = &xbuf[IDX[k]];
-				sg[k].page = virt_to_page(p);
-				sg[k].offset = offset_in_page(p);
-				sg[k].length = hash_tv[i].tap[k];
+				sg_set_buf(&sg[k], &xbuf[IDX[k]],
+					    hash_tv[i].tap[k]);
 			}
 
 			crypto_digest_digest(tfm, sg, hash_tv[i].np, result);
@@ -179,7 +173,6 @@ static void test_hash(char *algo, struct hash_testvec *template,
 static void test_hmac(char *algo, struct hmac_testvec *template,
 		      unsigned int tcount)
 {
-	char *p;
 	unsigned int i, j, k, temp;
 	struct scatterlist sg[8];
 	char result[64];
@@ -210,11 +203,8 @@ static void test_hmac(char *algo, struct hmac_testvec *template,
 		printk("test %u:\n", i + 1);
 		memset(result, 0, sizeof (result));
 
-		p = hmac_tv[i].plaintext;
 		klen = hmac_tv[i].ksize;
-		sg[0].page = virt_to_page(p);
-		sg[0].offset = offset_in_page(p);
-		sg[0].length = hmac_tv[i].psize;
+		sg_set_buf(&sg[0], hmac_tv[i].plaintext, hmac_tv[i].psize);
 
 		crypto_hmac(tfm, hmac_tv[i].key, &klen, sg, 1, result);
 
@@ -243,10 +233,8 @@ static void test_hmac(char *algo, struct hmac_testvec *template,
 				       hmac_tv[i].plaintext + temp,
 				       hmac_tv[i].tap[k]);
 				temp += hmac_tv[i].tap[k];
-				p = &xbuf[IDX[k]];
-				sg[k].page = virt_to_page(p);
-				sg[k].offset = offset_in_page(p);
-				sg[k].length = hmac_tv[i].tap[k];
+				sg_set_buf(&sg[k], &xbuf[IDX[k]],
+					    hmac_tv[i].tap[k]);
 			}
 
 			crypto_hmac(tfm, hmac_tv[i].key, &klen, sg,
@@ -270,7 +258,7 @@ static void test_cipher(char *algo, int mode, int enc,
 {
 	unsigned int ret, i, j, k, temp;
 	unsigned int tsize;
-	char *p, *q;
+	char *q;
 	struct crypto_tfm *tfm;
 	char *key;
 	struct cipher_testvec *cipher_tv;
@@ -330,10 +318,8 @@ static void test_cipher(char *algo, int mode, int enc,
 					goto out;
 			}
 
-			p = cipher_tv[i].input;
-			sg[0].page = virt_to_page(p);
-			sg[0].offset = offset_in_page(p);
-			sg[0].length = cipher_tv[i].ilen;
+			sg_set_buf(&sg[0], cipher_tv[i].input,
+				   cipher_tv[i].ilen);
 
 			if (!mode) {
 				crypto_cipher_set_iv(tfm, cipher_tv[i].iv,
@@ -389,10 +375,8 @@ static void test_cipher(char *algo, int mode, int enc,
 				       cipher_tv[i].input + temp,
 				       cipher_tv[i].tap[k]);
 				temp += cipher_tv[i].tap[k];
-				p = &xbuf[IDX[k]];
-				sg[k].page = virt_to_page(p);
-				sg[k].offset = offset_in_page(p);
-				sg[k].length = cipher_tv[i].tap[k];
+				sg_set_buf(&sg[k], &xbuf[IDX[k]],
+					   cipher_tv[i].tap[k]);
 			}
 
 			if (!mode) {
@@ -436,9 +420,7 @@ static int test_cipher_jiffies(struct crypto_tfm *tfm, int enc, char *p,
 	int bcount;
 	int ret;
 
-	sg[0].page = virt_to_page(p);
-	sg[0].offset = offset_in_page(p);
-	sg[0].length = blen;
+	sg_set_buf(&sg[0], p, blen);
 
 	for (start = jiffies, end = start + sec * HZ, bcount = 0;
 	     time_before(jiffies, end); bcount++) {
@@ -464,9 +446,7 @@ static int test_cipher_cycles(struct crypto_tfm *tfm, int enc, char *p,
 	int ret = 0;
 	int i;
 
-	sg[0].page = virt_to_page(p);
-	sg[0].offset = offset_in_page(p);
-	sg[0].length = blen;
+	sg_set_buf(&sg[0], p, blen);
 
 	local_bh_disable();
 	local_irq_disable();
@@ -709,9 +689,7 @@ static void test_crc32c(void)
 	for (i = 0; i < NUMVEC; i++) {
 		for (j = 0; j < VECSIZE; j++)
 			test_vec[i][j] = ++b;
-		sg[i].page = virt_to_page(test_vec[i]);
-		sg[i].offset = offset_in_page(test_vec[i]);
-		sg[i].length = VECSIZE;
+		sg_set_buf(&sg[i], test_vec[i], VECSIZE);
 	}
 
 	seed = SEEDTESTVAL;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 28c1a628621f..cf6631056683 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -15,7 +15,7 @@
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
 #include <asm/atomic.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm/page.h>
 
 #include "dm.h"
@@ -164,9 +164,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 		return -ENOMEM;
 	}
 
-	sg.page = virt_to_page(cc->key);
-	sg.offset = offset_in_page(cc->key);
-	sg.length = cc->key_size;
+	sg_set_buf(&sg, cc->key, cc->key_size);
 	crypto_digest_digest(hash_tfm, &sg, 1, salt);
 	crypto_free_tfm(hash_tfm);
 
@@ -207,14 +205,12 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 
 static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 {
-	struct scatterlist sg = { NULL, };
+	struct scatterlist sg;
 
 	memset(iv, 0, cc->iv_size);
 	*(u64 *)iv = cpu_to_le64(sector);
 
-	sg.page = virt_to_page(iv);
-	sg.offset = offset_in_page(iv);
-	sg.length = cc->iv_size;
+	sg_set_buf(&sg, iv, cc->iv_size);
 	crypto_cipher_encrypt((struct crypto_tfm *)cc->iv_gen_private,
 	                      &sg, &sg, cc->iv_size);
 
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index 4c11699bad91..1609ce11389d 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -35,6 +35,7 @@
 #include <linux/interrupt.h>
 #include <linux/in.h>
 #include <linux/bitops.h>
+#include <linux/scatterlist.h>
 #include <asm/io.h>
 #include <asm/system.h>
 
@@ -1590,9 +1591,7 @@ static void emmh32_setseed(emmh32_context *context, u8 *pkey, int keylen, struct
 		aes_counter[12] = (u8)(counter >> 24);
 		counter++;
 		memcpy (plain, aes_counter, 16);
-		sg[0].page = virt_to_page(plain);
-		sg[0].offset = ((long) plain & ~PAGE_MASK);
-		sg[0].length = 16;
+		sg_set_buf(&sg[0], plain, 16);
 		crypto_cipher_encrypt(tfm, sg, sg, 16);
 		cipher = kmap(sg[0].page) + sg[0].offset;
 		for (j=0; (j<16) && (i< (sizeof(context->coeff)/sizeof(context->coeff[0]))); ) {
diff --git a/drivers/scsi/arm/scsi.h b/drivers/scsi/arm/scsi.h
index 48e1c4d9738b..19937640e2e7 100644
--- a/drivers/scsi/arm/scsi.h
+++ b/drivers/scsi/arm/scsi.h
@@ -10,6 +10,8 @@
  *  Commonly used scsi driver functions.
  */
 
+#include <linux/scatterlist.h>
+
 #define BELT_AND_BRACES
 
 /*
@@ -22,9 +24,7 @@ static inline int copy_SCp_to_sg(struct scatterlist *sg, Scsi_Pointer *SCp, int
 
 	BUG_ON(bufs + 1 > max);
 
-	sg->page   = virt_to_page(SCp->ptr);
-	sg->offset = offset_in_page(SCp->ptr);
-	sg->length = SCp->this_residual;
+	sg_set_buf(sg, SCp->ptr, SCp->this_residual);
 
 	if (bufs)
 		memcpy(sg + 1, SCp->buffer + 1,
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index b1b1c6f01419..5ca97605ff35 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -49,6 +49,7 @@
 #include <linux/suspend.h>
 #include <linux/workqueue.h>
 #include <linux/jiffies.h>
+#include <linux/scatterlist.h>
 #include <scsi/scsi.h>
 #include "scsi.h"
 #include "scsi_priv.h"
@@ -2576,19 +2577,12 @@ void ata_qc_prep(struct ata_queued_cmd *qc)
 
 void ata_sg_init_one(struct ata_queued_cmd *qc, void *buf, unsigned int buflen)
 {
-	struct scatterlist *sg;
-
 	qc->flags |= ATA_QCFLAG_SINGLE;
 
-	memset(&qc->sgent, 0, sizeof(qc->sgent));
 	qc->sg = &qc->sgent;
 	qc->n_elem = 1;
 	qc->buf_virt = buf;
-
-	sg = qc->sg;
-	sg->page = virt_to_page(buf);
-	sg->offset = (unsigned long) buf & ~PAGE_MASK;
-	sg->length = buflen;
+	sg_init_one(qc->sg, buf, buflen);
 }
 
 /**
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 861e51375d70..07fee811c09e 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -49,6 +49,7 @@ static int sg_version_num = 30533;	/* 2 digits for each component */
 #include <linux/seq_file.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
+#include <linux/scatterlist.h>
 
 #include "scsi.h"
 #include <scsi/scsi_dbg.h>
@@ -1992,9 +1993,7 @@ sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size)
 				if (!p)
 					break;
 			}
-			sclp->page = virt_to_page(p);
-			sclp->offset = offset_in_page(p);
-			sclp->length = ret_sz;
+			sg_set_buf(sclp, p, ret_sz);
 
 			SCSI_LOG_TIMEOUT(5, printk("sg_build_build: k=%d, a=0x%p, len=%d\n",
 					  k, sg_scatg2virt(sclp), ret_sz));
diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c
index 90a96257d6ce..2997f558159b 100644
--- a/drivers/usb/misc/usbtest.c
+++ b/drivers/usb/misc/usbtest.c
@@ -9,7 +9,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 
 #include <linux/usb.h>
 
@@ -381,7 +381,6 @@ alloc_sglist (int nents, int max, int vary)
 	sg = kmalloc (nents * sizeof *sg, SLAB_KERNEL);
 	if (!sg)
 		return NULL;
-	memset (sg, 0, nents * sizeof *sg);
 
 	for (i = 0; i < nents; i++) {
 		char		*buf;
@@ -394,9 +393,7 @@ alloc_sglist (int nents, int max, int vary)
 		memset (buf, 0, size);
 
 		/* kmalloc pages are always physically contiguous! */
-		sg [i].page = virt_to_page (buf);
-		sg [i].offset = offset_in_page (buf);
-		sg [i].length = size;
+		sg_init_one(&sg[i], buf, size);
 
 		if (vary) {
 			size += vary;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a970b4727ce8..41edc14851e8 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -75,7 +75,7 @@
 #ifdef CONFIG_IPV6_PRIVACY
 #include <linux/random.h>
 #include <linux/crypto.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #endif
 
 #include <asm/uaccess.h>
@@ -1217,12 +1217,8 @@ static int __ipv6_regen_rndid(struct inet6_dev *idev)
 	struct net_device *dev;
 	struct scatterlist sg[2];
 
-	sg[0].page = virt_to_page(idev->entropy);
-	sg[0].offset = offset_in_page(idev->entropy);
-	sg[0].length = 8;
-	sg[1].page = virt_to_page(idev->work_eui64);
-	sg[1].offset = offset_in_page(idev->work_eui64);
-	sg[1].length = 8;
+	sg_set_buf(&sg[0], idev->entropy, 8);
+	sg_set_buf(&sg[1], idev->work_eui64, 8);
 
 	dev = idev->dev;
 
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 3f3d5437f02d..e65e1f979275 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -37,7 +37,7 @@
 #include <linux/types.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/crypto.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
@@ -75,9 +75,7 @@ krb5_encrypt(
 		memcpy(local_iv, iv, crypto_tfm_alg_ivsize(tfm));
 
 	memcpy(out, in, length);
-	sg[0].page = virt_to_page(out);
-	sg[0].offset = offset_in_page(out);
-	sg[0].length = length;
+	sg_set_buf(&sg[0], out, length);
 
 	ret = crypto_cipher_encrypt_iv(tfm, sg, sg, length, local_iv);
 
@@ -117,9 +115,7 @@ krb5_decrypt(
 		memcpy(local_iv,iv, crypto_tfm_alg_ivsize(tfm));
 
 	memcpy(out, in, length);
-	sg[0].page = virt_to_page(out);
-	sg[0].offset = offset_in_page(out);
-	sg[0].length = length;
+	sg_set_buf(&sg[0], out, length);
 
 	ret = crypto_cipher_decrypt_iv(tfm, sg, sg, length, local_iv);
 
@@ -132,13 +128,6 @@ out:
 
 EXPORT_SYMBOL(krb5_decrypt);
 
-static void
-buf_to_sg(struct scatterlist *sg, char *ptr, int len) {
-	sg->page = virt_to_page(ptr);
-	sg->offset = offset_in_page(ptr);
-	sg->length = len;
-}
-
 static int
 process_xdr_buf(struct xdr_buf *buf, int offset, int len,
 		int (*actor)(struct scatterlist *, void *), void *data)
@@ -152,7 +141,7 @@ process_xdr_buf(struct xdr_buf *buf, int offset, int len,
 		thislen = buf->head[0].iov_len - offset;
 		if (thislen > len)
 			thislen = len;
-		buf_to_sg(sg, buf->head[0].iov_base + offset, thislen);
+		sg_set_buf(sg, buf->head[0].iov_base + offset, thislen);
 		ret = actor(sg, data);
 		if (ret)
 			goto out;
@@ -195,7 +184,7 @@ process_xdr_buf(struct xdr_buf *buf, int offset, int len,
 		thislen = buf->tail[0].iov_len - offset;
 		if (thislen > len)
 			thislen = len;
-		buf_to_sg(sg, buf->tail[0].iov_base + offset, thislen);
+		sg_set_buf(sg, buf->tail[0].iov_base + offset, thislen);
 		ret = actor(sg, data);
 		len -= thislen;
 	}
@@ -241,7 +230,7 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
 		goto out;
 
 	crypto_digest_init(tfm);
-	buf_to_sg(sg, header, hdrlen);
+	sg_set_buf(sg, header, hdrlen);
 	crypto_digest_update(tfm, sg, 1);
 	process_xdr_buf(body, body_offset, body->len - body_offset,
 			checksummer, tfm);

From 6df5b9f48dd0e77fa796b9b7d3fde7cc5f1237f2 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 19 Sep 2005 22:30:11 +1000
Subject: [PATCH 03/91] [CRYPTO] Simplify one-member scatterlist expressions

This patch rewrites various occurrences of &sg[0], where sg is an array
of length one, to simply sg.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/tcrypt.c                       | 8 ++++----
 drivers/net/wireless/airo.c           | 4 ++--
 net/sunrpc/auth_gss/gss_krb5_crypto.c | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 577a3aff3113..53f4ee804bdb 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -415,12 +415,12 @@ out:
 static int test_cipher_jiffies(struct crypto_tfm *tfm, int enc, char *p,
 			       int blen, int sec)
 {
-	struct scatterlist sg[8];
+	struct scatterlist sg[1];
 	unsigned long start, end;
 	int bcount;
 	int ret;
 
-	sg_set_buf(&sg[0], p, blen);
+	sg_set_buf(sg, p, blen);
 
 	for (start = jiffies, end = start + sec * HZ, bcount = 0;
 	     time_before(jiffies, end); bcount++) {
@@ -441,12 +441,12 @@ static int test_cipher_jiffies(struct crypto_tfm *tfm, int enc, char *p,
 static int test_cipher_cycles(struct crypto_tfm *tfm, int enc, char *p,
 			      int blen)
 {
-	struct scatterlist sg[8];
+	struct scatterlist sg[1];
 	unsigned long cycles = 0;
 	int ret = 0;
 	int i;
 
-	sg_set_buf(&sg[0], p, blen);
+	sg_set_buf(sg, p, blen);
 
 	local_bh_disable();
 	local_irq_disable();
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index 1609ce11389d..750c0167539c 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -1591,9 +1591,9 @@ static void emmh32_setseed(emmh32_context *context, u8 *pkey, int keylen, struct
 		aes_counter[12] = (u8)(counter >> 24);
 		counter++;
 		memcpy (plain, aes_counter, 16);
-		sg_set_buf(&sg[0], plain, 16);
+		sg_set_buf(sg, plain, 16);
 		crypto_cipher_encrypt(tfm, sg, sg, 16);
-		cipher = kmap(sg[0].page) + sg[0].offset;
+		cipher = kmap(sg->page) + sg->offset;
 		for (j=0; (j<16) && (i< (sizeof(context->coeff)/sizeof(context->coeff[0]))); ) {
 			context->coeff[i++] = ntohl(*(u32 *)&cipher[j]);
 			j += 4;
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index e65e1f979275..97c981fa6b8e 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -75,7 +75,7 @@ krb5_encrypt(
 		memcpy(local_iv, iv, crypto_tfm_alg_ivsize(tfm));
 
 	memcpy(out, in, length);
-	sg_set_buf(&sg[0], out, length);
+	sg_set_buf(sg, out, length);
 
 	ret = crypto_cipher_encrypt_iv(tfm, sg, sg, length, local_iv);
 
@@ -115,7 +115,7 @@ krb5_decrypt(
 		memcpy(local_iv,iv, crypto_tfm_alg_ivsize(tfm));
 
 	memcpy(out, in, length);
-	sg_set_buf(&sg[0], out, length);
+	sg_set_buf(sg, out, length);
 
 	ret = crypto_cipher_decrypt_iv(tfm, sg, sg, length, local_iv);
 

From 1b40efd772f4419fbc1a8940506424246985c333 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 3 Oct 2005 15:15:36 +1000
Subject: [PATCH 04/91] [CRYPTO] Check cra_alignmask against cra_blocksize

The cipher code relies on the fact that the block size is a multiple
of the required alignment.  So we should check this at the time of
algorithm registration.  We also ensure that the block size is bounded
by the page size.
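
As a worked example of the new check (the values are illustrative):

	/* cra_alignmask must be a power of two minus one, e.g. 0x3 for
	 * 4-byte alignment.  Since the alignment is a power of two, the
	 * block size is a multiple of it exactly when its low bits are clear:
	 *
	 *   cra_alignmask = 0x3, cra_blocksize = 16:  16 & 0x3 == 0  -> OK
	 *   cra_alignmask = 0x3, cra_blocksize =  6:   6 & 0x3 == 2  -> -EINVAL
	 */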

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/api.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/crypto/api.c b/crypto/api.c
index 959c4e5f264f..40ae42e9b6a6 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -215,7 +215,10 @@ int crypto_register_alg(struct crypto_alg *alg)
 	if (alg->cra_alignmask & (alg->cra_alignmask + 1))
 		return -EINVAL;
 
-	if (alg->cra_alignmask > PAGE_SIZE)
+	if (alg->cra_alignmask & alg->cra_blocksize)
+		return -EINVAL;
+
+	if (alg->cra_blocksize > PAGE_SIZE)
 		return -EINVAL;
 	
 	down_write(&crypto_alg_sem);

From 0169e284f6b6b263cc7c2ed25986b96cd6fda610 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jgarzik@pobox.com>
Date: Sat, 29 Oct 2005 21:25:10 -0400
Subject: [PATCH 05/91] [libata] remove ata_chk_err(), ->check_err() hook.

We now depend on ->tf_read() to provide us with the contents
of the Error shadow register.
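
A minimal sketch of the replacement pattern, as used in the libata hunks
below (not a complete call site):

	struct ata_taskfile tf;
	u8 err;

	/* instead of err = ata_chk_err(ap) / ap->ops->check_err(ap) */
	ap->ops->tf_read(ap, &tf);
	err = tf.feature;	/* Error shadow register */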
---
 drivers/scsi/ahci.c        |  9 ---------
 drivers/scsi/libata-core.c | 41 +++++++++-----------------------------
 drivers/scsi/sata_mv.c     | 18 -----------------
 drivers/scsi/sata_sil24.c  |  8 --------
 include/linux/libata.h     |  2 --
 5 files changed, 9 insertions(+), 69 deletions(-)

diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index fe8187d6f58b..03829aedfd39 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -192,7 +192,6 @@ static void ahci_port_stop(struct ata_port *ap);
 static void ahci_tf_read(struct ata_port *ap, struct ata_taskfile *tf);
 static void ahci_qc_prep(struct ata_queued_cmd *qc);
 static u8 ahci_check_status(struct ata_port *ap);
-static u8 ahci_check_err(struct ata_port *ap);
 static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc);
 static void ahci_remove_one (struct pci_dev *pdev);
 
@@ -221,7 +220,6 @@ static const struct ata_port_operations ahci_ops = {
 
 	.check_status		= ahci_check_status,
 	.check_altstatus	= ahci_check_status,
-	.check_err		= ahci_check_err,
 	.dev_select		= ata_noop_dev_select,
 
 	.tf_read		= ahci_tf_read,
@@ -458,13 +456,6 @@ static u8 ahci_check_status(struct ata_port *ap)
 	return readl(mmio + PORT_TFDATA) & 0xFF;
 }
 
-static u8 ahci_check_err(struct ata_port *ap)
-{
-	void __iomem *mmio = (void __iomem *) ap->ioaddr.cmd_addr;
-
-	return (readl(mmio + PORT_TFDATA) >> 8) & 0xFF;
-}
-
 static void ahci_tf_read(struct ata_port *ap, struct ata_taskfile *tf)
 {
 	struct ahci_port_priv *pp = ap->private_data;
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index b1b1c6f01419..d2f71a2331bb 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -371,7 +371,7 @@ static void ata_tf_read_pio(struct ata_port *ap, struct ata_taskfile *tf)
 	struct ata_ioports *ioaddr = &ap->ioaddr;
 
 	tf->command = ata_check_status(ap);
-	tf->feature = ata_chk_err(ap);
+	tf->feature = inb(ioaddr->error_addr);
 	tf->nsect = inb(ioaddr->nsect_addr);
 	tf->lbal = inb(ioaddr->lbal_addr);
 	tf->lbam = inb(ioaddr->lbam_addr);
@@ -405,7 +405,7 @@ static void ata_tf_read_mmio(struct ata_port *ap, struct ata_taskfile *tf)
 	struct ata_ioports *ioaddr = &ap->ioaddr;
 
 	tf->command = ata_check_status(ap);
-	tf->feature = ata_chk_err(ap);
+	tf->feature = readb((void __iomem *)ioaddr->error_addr);
 	tf->nsect = readb((void __iomem *)ioaddr->nsect_addr);
 	tf->lbal = readb((void __iomem *)ioaddr->lbal_addr);
 	tf->lbam = readb((void __iomem *)ioaddr->lbam_addr);
@@ -525,30 +525,6 @@ u8 ata_altstatus(struct ata_port *ap)
 }
 
 
-/**
- *	ata_chk_err - Read device error reg
- *	@ap: port where the device is
- *
- *	Reads ATA taskfile error register for
- *	currently-selected device and return its value.
- *
- *	Note: may NOT be used as the check_err() entry in
- *	ata_port_operations.
- *
- *	LOCKING:
- *	Inherited from caller.
- */
-u8 ata_chk_err(struct ata_port *ap)
-{
-	if (ap->ops->check_err)
-		return ap->ops->check_err(ap);
-
-	if (ap->flags & ATA_FLAG_MMIO) {
-		return readb((void __iomem *) ap->ioaddr.error_addr);
-	}
-	return inb(ap->ioaddr.error_addr);
-}
-
 /**
  *	ata_tf_to_fis - Convert ATA taskfile to SATA FIS structure
  *	@tf: Taskfile to convert
@@ -901,8 +877,8 @@ static u8 ata_dev_try_classify(struct ata_port *ap, unsigned int device)
 
 	memset(&tf, 0, sizeof(tf));
 
-	err = ata_chk_err(ap);
 	ap->ops->tf_read(ap, &tf);
+	err = tf.feature;
 
 	dev->class = ATA_DEV_NONE;
 
@@ -1139,7 +1115,6 @@ static void ata_dev_identify(struct ata_port *ap, unsigned int device)
 	unsigned int major_version;
 	u16 tmp;
 	unsigned long xfer_modes;
-	u8 status;
 	unsigned int using_edd;
 	DECLARE_COMPLETION(wait);
 	struct ata_queued_cmd *qc;
@@ -1193,8 +1168,11 @@ retry:
 	else
 		wait_for_completion(&wait);
 
-	status = ata_chk_status(ap);
-	if (status & ATA_ERR) {
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	ap->ops->tf_read(ap, &qc->tf);
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	if (qc->tf.command & ATA_ERR) {
 		/*
 		 * arg!  EDD works for all test cases, but seems to return
 		 * the ATA signature for some ATAPI devices.  Until the
@@ -1207,7 +1185,7 @@ retry:
 		 * to have this problem.
 		 */
 		if ((using_edd) && (qc->tf.command == ATA_CMD_ID_ATA)) {
-			u8 err = ata_chk_err(ap);
+			u8 err = qc->tf.feature;
 			if (err & ATA_ABORTED) {
 				dev->class = ATA_DEV_ATAPI;
 				qc->cursg = 0;
@@ -4873,7 +4851,6 @@ EXPORT_SYMBOL_GPL(ata_tf_to_fis);
 EXPORT_SYMBOL_GPL(ata_tf_from_fis);
 EXPORT_SYMBOL_GPL(ata_check_status);
 EXPORT_SYMBOL_GPL(ata_altstatus);
-EXPORT_SYMBOL_GPL(ata_chk_err);
 EXPORT_SYMBOL_GPL(ata_exec_command);
 EXPORT_SYMBOL_GPL(ata_port_start);
 EXPORT_SYMBOL_GPL(ata_port_stop);
diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c
index 422e0b6f603a..dcef5fe8600b 100644
--- a/drivers/scsi/sata_mv.c
+++ b/drivers/scsi/sata_mv.c
@@ -258,7 +258,6 @@ struct mv_host_priv {
 static void mv_irq_clear(struct ata_port *ap);
 static u32 mv_scr_read(struct ata_port *ap, unsigned int sc_reg_in);
 static void mv_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val);
-static u8 mv_check_err(struct ata_port *ap);
 static void mv_phy_reset(struct ata_port *ap);
 static void mv_host_stop(struct ata_host_set *host_set);
 static int mv_port_start(struct ata_port *ap);
@@ -296,7 +295,6 @@ static const struct ata_port_operations mv_ops = {
 	.tf_load		= ata_tf_load,
 	.tf_read		= ata_tf_read,
 	.check_status		= ata_check_status,
-	.check_err		= mv_check_err,
 	.exec_command		= ata_exec_command,
 	.dev_select		= ata_std_dev_select,
 
@@ -1184,22 +1182,6 @@ static irqreturn_t mv_interrupt(int irq, void *dev_instance,
 	return IRQ_RETVAL(handled);
 }
 
-/**
- *      mv_check_err - Return the error shadow register to caller.
- *      @ap: ATA channel to manipulate
- *
- *      Marvell requires DMA to be stopped before accessing shadow
- *      registers.  So we do that, then return the needed register.
- *
- *      LOCKING:
- *      Inherited from caller.  FIXME: protect mv_stop_dma with lock?
- */
-static u8 mv_check_err(struct ata_port *ap)
-{
-	mv_stop_dma(ap);		/* can't read shadow regs if DMA on */
-	return readb((void __iomem *) ap->ioaddr.error_addr);
-}
-
 /**
  *      mv_phy_reset - Perform eDMA reset followed by COMRESET
  *      @ap: ATA channel to manipulate
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index 51855d3bac64..e18a1e2bb65e 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -225,7 +225,6 @@ struct sil24_host_priv {
 };
 
 static u8 sil24_check_status(struct ata_port *ap);
-static u8 sil24_check_err(struct ata_port *ap);
 static u32 sil24_scr_read(struct ata_port *ap, unsigned sc_reg);
 static void sil24_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val);
 static void sil24_tf_read(struct ata_port *ap, struct ata_taskfile *tf);
@@ -280,7 +279,6 @@ static const struct ata_port_operations sil24_ops = {
 
 	.check_status		= sil24_check_status,
 	.check_altstatus	= sil24_check_status,
-	.check_err		= sil24_check_err,
 	.dev_select		= ata_noop_dev_select,
 
 	.tf_read		= sil24_tf_read,
@@ -363,12 +361,6 @@ static u8 sil24_check_status(struct ata_port *ap)
 	return pp->tf.command;
 }
 
-static u8 sil24_check_err(struct ata_port *ap)
-{
-	struct sil24_port_priv *pp = ap->private_data;
-	return pp->tf.feature;
-}
-
 static int sil24_scr_map[] = {
 	[SCR_CONTROL]	= 0,
 	[SCR_STATUS]	= 1,
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 00a8a5738858..a4cce9936a80 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -347,7 +347,6 @@ struct ata_port_operations {
 	void (*exec_command)(struct ata_port *ap, const struct ata_taskfile *tf);
 	u8   (*check_status)(struct ata_port *ap);
 	u8   (*check_altstatus)(struct ata_port *ap);
-	u8   (*check_err)(struct ata_port *ap);
 	void (*dev_select)(struct ata_port *ap, unsigned int device);
 
 	void (*phy_reset) (struct ata_port *ap);
@@ -434,7 +433,6 @@ extern void ata_noop_dev_select (struct ata_port *ap, unsigned int device);
 extern void ata_std_dev_select (struct ata_port *ap, unsigned int device);
 extern u8 ata_check_status(struct ata_port *ap);
 extern u8 ata_altstatus(struct ata_port *ap);
-extern u8 ata_chk_err(struct ata_port *ap);
 extern void ata_exec_command(struct ata_port *ap, const struct ata_taskfile *tf);
 extern int ata_port_start (struct ata_port *ap);
 extern void ata_port_stop (struct ata_port *ap);

From 930fc45a49ddebe7555cc5c837d82b9c27e65ff4 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Sat, 29 Oct 2005 18:15:41 -0700
Subject: [PATCH 06/91] [PATCH] vmalloc_node

This patch adds

vmalloc_node(size, node)	-> Allocate necessary memory on the specified node

and

get_vm_area_node(size, flags, node)

and the other functions that it depends on.
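
A minimal usage sketch (the size and the fallback are hypothetical; passing
node -1 keeps the old any-node behaviour, as in the wrappers below):

	/* per-node scratch buffer; fall back to any node if that fails */
	void *buf = vmalloc_node(64 * 1024, numa_node_id());
	if (!buf)
		buf = vmalloc(64 * 1024);
	if (buf) {
		/* ... use buf ... */
		vfree(buf);
	}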

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/vmalloc.h |  8 ++++-
 mm/vmalloc.c            | 73 ++++++++++++++++++++++++++++++++---------
 2 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3701a0673d2c..1d5577b2b752 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -32,10 +32,14 @@ struct vm_struct {
  *	Highlevel APIs for driver use
  */
 extern void *vmalloc(unsigned long size);
+extern void *vmalloc_node(unsigned long size, int node);
 extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
-extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot);
+extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
+				pgprot_t prot);
+extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask,
+				pgprot_t prot, int node);
 extern void vfree(void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
@@ -48,6 +52,8 @@ extern void vunmap(void *addr);
 extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
 extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
 					unsigned long start, unsigned long end);
+extern struct vm_struct *get_vm_area_node(unsigned long size,
+					unsigned long flags, int node);
 extern struct vm_struct *remove_vm_area(void *addr);
 extern struct vm_struct *__remove_vm_area(void *addr);
 extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1150229b6366..5e9120598799 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -5,6 +5,7 @@
  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
  *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
+ *  Numa awareness, Christoph Lameter, SGI, June 2005
  */
 
 #include <linux/mm.h>
@@ -158,8 +159,8 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
 	return err;
 }
 
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
-				unsigned long start, unsigned long end)
+struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
+				unsigned long start, unsigned long end, int node)
 {
 	struct vm_struct **p, *tmp, *area;
 	unsigned long align = 1;
@@ -178,7 +179,7 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
 	addr = ALIGN(start, align);
 	size = PAGE_ALIGN(size);
 
-	area = kmalloc(sizeof(*area), GFP_KERNEL);
+	area = kmalloc_node(sizeof(*area), GFP_KERNEL, node);
 	if (unlikely(!area))
 		return NULL;
 
@@ -231,6 +232,12 @@ out:
 	return NULL;
 }
 
+struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
+				unsigned long start, unsigned long end)
+{
+	return __get_vm_area_node(size, flags, start, end, -1);
+}
+
 /**
  *	get_vm_area  -  reserve a contingous kernel virtual area
  *
@@ -246,6 +253,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
 	return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
 }
 
+struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node)
+{
+	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node);
+}
+
 /* Caller must hold vmlist_lock */
 struct vm_struct *__remove_vm_area(void *addr)
 {
@@ -342,7 +354,6 @@ void vfree(void *addr)
 	BUG_ON(in_interrupt());
 	__vunmap(addr, 1);
 }
-
 EXPORT_SYMBOL(vfree);
 
 /**
@@ -360,7 +371,6 @@ void vunmap(void *addr)
 	BUG_ON(in_interrupt());
 	__vunmap(addr, 0);
 }
-
 EXPORT_SYMBOL(vunmap);
 
 /**
@@ -392,10 +402,10 @@ void *vmap(struct page **pages, unsigned int count,
 
 	return area->addr;
 }
-
 EXPORT_SYMBOL(vmap);
 
-void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
+				pgprot_t prot, int node)
 {
 	struct page **pages;
 	unsigned int nr_pages, array_size, i;
@@ -406,9 +416,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
 	area->nr_pages = nr_pages;
 	/* Please note that the recursion is strictly bounded. */
 	if (array_size > PAGE_SIZE)
-		pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL);
+		pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
 	else
-		pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM));
+		pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
 	area->pages = pages;
 	if (!area->pages) {
 		remove_vm_area(area->addr);
@@ -418,7 +428,10 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
 	memset(area->pages, 0, array_size);
 
 	for (i = 0; i < area->nr_pages; i++) {
-		area->pages[i] = alloc_page(gfp_mask);
+		if (node < 0)
+			area->pages[i] = alloc_page(gfp_mask);
+		else
+			area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
 		if (unlikely(!area->pages[i])) {
 			/* Successfully allocated i pages, free them in __vunmap() */
 			area->nr_pages = i;
@@ -435,18 +448,25 @@ fail:
 	return NULL;
 }
 
+void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
+{
+	return __vmalloc_area_node(area, gfp_mask, prot, -1);
+}
+
 /**
- *	__vmalloc  -  allocate virtually contiguous memory
+ *	__vmalloc_node  -  allocate virtually contiguous memory
  *
  *	@size:		allocation size
  *	@gfp_mask:	flags for the page level allocator
  *	@prot:		protection mask for the allocated pages
+ *	@node		node to use for allocation or -1
  *
  *	Allocate enough pages to cover @size from the page level
  *	allocator with @gfp_mask flags.  Map them into contiguous
  *	kernel virtual space, using a pagetable protection of @prot.
  */
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
+			int node)
 {
 	struct vm_struct *area;
 
@@ -454,13 +474,18 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 	if (!size || (size >> PAGE_SHIFT) > num_physpages)
 		return NULL;
 
-	area = get_vm_area(size, VM_ALLOC);
+	area = get_vm_area_node(size, VM_ALLOC, node);
 	if (!area)
 		return NULL;
 
-	return __vmalloc_area(area, gfp_mask, prot);
+	return __vmalloc_area_node(area, gfp_mask, prot, node);
 }
+EXPORT_SYMBOL(__vmalloc_node);
 
+void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+{
+	return __vmalloc_node(size, gfp_mask, prot, -1);
+}
 EXPORT_SYMBOL(__vmalloc);
 
 /**
@@ -478,9 +503,26 @@ void *vmalloc(unsigned long size)
 {
        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
 }
-
 EXPORT_SYMBOL(vmalloc);
 
+/**
+ *	vmalloc_node  -  allocate memory on a specific node
+ *
+ *	@size:		allocation size
+ *	@node:		numa node
+ *
+ *	Allocate enough pages to cover @size from the page level
+ *	allocator and map them into contiguous kernel virtual space.
+ *
+ *	For tight control over page level allocator and protection flags
+ *	use __vmalloc() instead.
+ */
+void *vmalloc_node(unsigned long size, int node)
+{
+       return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
+}
+EXPORT_SYMBOL(vmalloc_node);
+
 #ifndef PAGE_KERNEL_EXEC
 # define PAGE_KERNEL_EXEC PAGE_KERNEL
 #endif
@@ -515,7 +557,6 @@ void *vmalloc_32(unsigned long size)
 {
 	return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
 }
-
 EXPORT_SYMBOL(vmalloc_32);
 
 long vread(char *buf, char *addr, unsigned long count)

From 4b8f573b5db02a3017afbba49026a6aef480174f Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Sat, 29 Oct 2005 18:15:42 -0700
Subject: [PATCH 07/91] [PATCH] TIMERS: add missing compensation for HZ == 250

Add missing compensation for (HZ == 250) != (1 << SHIFT_HZ) in
second_overflow().
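
A quick sanity check of the chosen shift terms (informal arithmetic; for
HZ == 250, SHIFT_HZ is 8, so the target is 1 << 8 = 256):

	/* 250 * (1 + 1/64 + 1/128) = 250 * 1.0234375 = 255.859375,
	 * i.e. within (256 - 255.859375) / 256 ~= 0.055% of the target,
	 * matching the "only 0.05% error" note in the comment below.
	 */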

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/kernel/timer.c b/kernel/timer.c
index 3ba10fa35b60..6a2e5f8dc725 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -752,6 +752,15 @@ static void second_overflow(void)
     else
 	time_adj += (time_adj >> 2) + (time_adj >> 5);
 #endif
+#if HZ == 250
+    /* Compensate for (HZ==250) != (1 << SHIFT_HZ).
+     * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14)
+     */
+    if (time_adj < 0)
+	time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
+    else
+	time_adj += (time_adj >> 6) + (time_adj >> 7);
+#endif
 #if HZ == 1000
     /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
      * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)

From 63f324cf0792ed69089b79d6921ba3aaea97af50 Mon Sep 17 00:00:00 2001
From: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Date: Sat, 29 Oct 2005 18:15:43 -0700
Subject: [PATCH 08/91] [PATCH] fix alpha breakage

barrier.h uses barrier() in the non-SMP case but doesn't include compiler.h.

Cc: Al Viro <viro@ftp.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/barrier.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/asm-alpha/barrier.h b/include/asm-alpha/barrier.h
index 229c83fe77cb..681ff581afa5 100644
--- a/include/asm-alpha/barrier.h
+++ b/include/asm-alpha/barrier.h
@@ -1,6 +1,8 @@
 #ifndef __BARRIER_H
 #define __BARRIER_H
 
+#include <asm/compiler.h>
+
 #define mb() \
 __asm__ __volatile__("mb": : :"memory")
 

From eb92f4ef320b738e41ad43476a5d05c8a20d5cc7 Mon Sep 17 00:00:00 2001
From: Rik Van Riel <riel@redhat.com>
Date: Sat, 29 Oct 2005 18:15:44 -0700
Subject: [PATCH 09/91] [PATCH] add sem_is_read/write_locked()

Add sem_is_read/write_locked functions to the read/write semaphores, along the
same lines as the *_is_locked spinlock functions.  The swap token tuning patch
uses sem_is_read_locked; sem_is_write_locked is added for completeness.
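
The helper added below is rwsem_is_locked(); a sketch of how the swap token
patch that follows uses it (mirroring the rmap.c hunk in the next patch):

	/* treat the page as referenced only if the token holder appears to
	 * be in the middle of a page fault, i.e. holds mmap_sem */
	if (mm != current->mm && has_swap_token(mm) &&
	    rwsem_is_locked(&mm->mmap_sem))
		referenced++;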

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/rwsem.h      | 5 +++++
 include/asm-i386/rwsem.h       | 5 +++++
 include/asm-ia64/rwsem.h       | 5 +++++
 include/asm-ppc/rwsem.h        | 5 +++++
 include/asm-ppc64/rwsem.h      | 5 +++++
 include/asm-s390/rwsem.h       | 5 +++++
 include/asm-sh/rwsem.h         | 5 +++++
 include/asm-sparc64/rwsem.h    | 5 +++++
 include/asm-x86_64/rwsem.h     | 5 +++++
 include/linux/rwsem-spinlock.h | 5 +++++
 10 files changed, 50 insertions(+)

diff --git a/include/asm-alpha/rwsem.h b/include/asm-alpha/rwsem.h
index 8e058a67c9a4..fafdd4f7010a 100644
--- a/include/asm-alpha/rwsem.h
+++ b/include/asm-alpha/rwsem.h
@@ -262,5 +262,10 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem)
 #endif
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _ALPHA_RWSEM_H */
diff --git a/include/asm-i386/rwsem.h b/include/asm-i386/rwsem.h
index 7625a675852f..be4ab859238e 100644
--- a/include/asm-i386/rwsem.h
+++ b/include/asm-i386/rwsem.h
@@ -284,5 +284,10 @@ LOCK_PREFIX	"xadd %0,(%2)"
 	return tmp+delta;
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _I386_RWSEM_H */
diff --git a/include/asm-ia64/rwsem.h b/include/asm-ia64/rwsem.h
index e18b5ab0cb75..1327c91ea39c 100644
--- a/include/asm-ia64/rwsem.h
+++ b/include/asm-ia64/rwsem.h
@@ -186,4 +186,9 @@ __downgrade_write (struct rw_semaphore *sem)
 #define rwsem_atomic_add(delta, sem)	atomic64_add(delta, (atomic64_t *)(&(sem)->count))
 #define rwsem_atomic_update(delta, sem)	atomic64_add_return(delta, (atomic64_t *)(&(sem)->count))
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* _ASM_IA64_RWSEM_H */
diff --git a/include/asm-ppc/rwsem.h b/include/asm-ppc/rwsem.h
index 3e738f483c11..3501ea72f88c 100644
--- a/include/asm-ppc/rwsem.h
+++ b/include/asm-ppc/rwsem.h
@@ -168,5 +168,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _PPC_RWSEM_XADD_H */
diff --git a/include/asm-ppc64/rwsem.h b/include/asm-ppc64/rwsem.h
index bd5c2f093575..7a647fae3765 100644
--- a/include/asm-ppc64/rwsem.h
+++ b/include/asm-ppc64/rwsem.h
@@ -163,5 +163,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _PPC_RWSEM_XADD_H */
diff --git a/include/asm-s390/rwsem.h b/include/asm-s390/rwsem.h
index 8c0cebbfc034..0422a085dd56 100644
--- a/include/asm-s390/rwsem.h
+++ b/include/asm-s390/rwsem.h
@@ -351,5 +351,10 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 	return new;
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _S390_RWSEM_H */
diff --git a/include/asm-sh/rwsem.h b/include/asm-sh/rwsem.h
index 1be4337f5259..0262d3d1e5e0 100644
--- a/include/asm-sh/rwsem.h
+++ b/include/asm-sh/rwsem.h
@@ -166,5 +166,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_SH_RWSEM_H */
diff --git a/include/asm-sparc64/rwsem.h b/include/asm-sparc64/rwsem.h
index 4568ee4022df..cef5e8270421 100644
--- a/include/asm-sparc64/rwsem.h
+++ b/include/asm-sparc64/rwsem.h
@@ -56,6 +56,11 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
 	atomic_add(delta, (atomic_t *)(&sem->count));
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _SPARC64_RWSEM_H */
diff --git a/include/asm-x86_64/rwsem.h b/include/asm-x86_64/rwsem.h
index c002175b6e82..46077e9c1910 100644
--- a/include/asm-x86_64/rwsem.h
+++ b/include/asm-x86_64/rwsem.h
@@ -274,5 +274,10 @@ LOCK_PREFIX	"xaddl %0,(%2)"
 	return tmp+delta;
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _X8664_RWSEM_H */
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index b52a2af25f1f..f30f805080ae 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -61,5 +61,10 @@ extern void FASTCALL(__up_read(struct rw_semaphore *sem));
 extern void FASTCALL(__up_write(struct rw_semaphore *sem));
 extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem));
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->activity != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_RWSEM_SPINLOCK_H */

From fcdae29aa7a5c79f245110f6680afdc1858d3626 Mon Sep 17 00:00:00 2001
From: Rik Van Riel <riel@redhat.com>
Date: Sat, 29 Oct 2005 18:15:46 -0700
Subject: [PATCH 10/91] [PATCH] swaptoken tuning

It turns out that the original swap token implementation, by Song Jiang, only
enforced the swap token while the task holding the token was handling a page
fault.  This patch approximates that, without adding an additional flag to the
mm_struct, by checking whether the mm->mmap_sem is held for reading, like the
page fault code does.

This patch has the effect of automatically, and gradually, disabling the
enforcement of the swap token when there is little or no paging going on, and
"turning up" the intensity of the swap token code the more the task holding
the token is thrashing.

Thanks to Song Jiang for pointing out this aspect of the token based thrashing
control concept.

The new code shows a slight degradation over the old swap token code, but
still a big win over running without the swap token.

2.6.12+ swap token disabled

$ for i in `seq 10` ; do /usr/bin/time ./qsbench -n 30000000 -p 3 ; done
101.74user 23.13system 8:26.91elapsed 24%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (38597major+430315minor)pagefaults 0swaps
101.98user 24.91system 8:03.06elapsed 26%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (33939major+430457minor)pagefaults 0swaps
101.93user 22.12system 7:34.90elapsed 27%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (33166major+421267minor)pagefaults 0swaps
101.82user 22.38system 8:31.40elapsed 24%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (39338major+433262minor)pagefaults 0swaps

2.6.12+ swap token enabled, timeout 300 seconds

$ for i in `seq 4` ; do /usr/bin/time ./qsbench -n 30000000 -p 3 ; done
102.58user 16.08system 3:41.44elapsed 53%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (19707major+285786minor)pagefaults 0swaps
102.07user 19.56system 4:00.64elapsed 50%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (19012major+299259minor)pagefaults 0swaps
102.64user 18.25system 4:07.31elapsed 48%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (21990major+304831minor)pagefaults 0swaps
101.39user 19.41system 5:15.81elapsed 38%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (24850major+323321minor)pagefaults 0swaps

2.6.12+ with new swap token code, timeout 300 seconds

$ for i in `seq 4` ; do /usr/bin/time ./qsbench -n 30000000 -p 3 ; done
101.87user 24.66system 5:53.20elapsed 35%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (26848major+363497minor)pagefaults 0swaps
102.83user 19.95system 4:17.25elapsed 47%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (19946major+305722minor)pagefaults 0swaps
102.09user 19.46system 5:12.57elapsed 38%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (25461major+334994minor)pagefaults 0swaps
101.67user 20.61system 4:52.97elapsed 41%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (22190major+329508minor)pagefaults 0swaps

Signed-off-by: Rik Van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/rmap.c   | 6 +++++-
 mm/thrash.c | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 450f5241b5a5..1fc559e09ca8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -298,7 +298,11 @@ static int page_referenced_one(struct page *page,
 		if (ptep_clear_flush_young(vma, address, pte))
 			referenced++;
 
-		if (mm != current->mm && !ignore_token && has_swap_token(mm))
+		/* Pretend the page is referenced if the task has the
+		   swap token and is in the middle of a page fault. */
+		if (mm != current->mm && !ignore_token &&
+				has_swap_token(mm) &&
+				rwsem_is_locked(&mm->mmap_sem))
 			referenced++;
 
 		(*mapcount)--;
diff --git a/mm/thrash.c b/mm/thrash.c
index 11461f7ad830..eff3c18c33a1 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -19,7 +19,7 @@ static unsigned long swap_token_check;
 struct mm_struct * swap_token_mm = &init_mm;
 
 #define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
-#define SWAP_TOKEN_TIMEOUT	0
+#define SWAP_TOKEN_TIMEOUT	(300 * HZ)
 /*
  * Currently disabled; Needs further code to work at HZ * 300.
  */

From ba56e91c940146e99ac694c4c7cd7f2b4aaa565d Mon Sep 17 00:00:00 2001
From: "Seth, Rohit" <rohit.seth@intel.com>
Date: Sat, 29 Oct 2005 18:15:47 -0700
Subject: [PATCH 11/91] [PATCH] mm: page_alloc: increase size of per-cpu-pages

Increase the page allocator's per-cpu magazines from 1/4MB to 1/2MB.

Over 100+ runs for a workload, the difference in mean is about 2%.  The best
results for both are almost the same.  However, the maximum variation in
results with 1/2MB is only 2.2%, whereas with 1/4MB it is 12%.
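
For a sense of scale, the effect of the new cap on zone_batchsize() (assuming
4 KB pages and a zone large enough to hit the cap):

	/* old: cap 256 KB -> 64 pages, /4 -> 16, (1 << fls(16 + 8)) - 1 = 31
	 * new: cap 512 KB -> 128 pages, /4 -> 32,  1 << fls(32 + 16)     = 64
	 */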

Signed-off-by: Rohit Seth <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94c864eac9c4..f799217dc2f3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1721,29 +1721,29 @@ static int __devinit zone_batchsize(struct zone *zone)
 
 	/*
 	 * The per-cpu-pages pools are set to around 1000th of the
-	 * size of the zone.  But no more than 1/4 of a meg - there's
-	 * no point in going beyond the size of L2 cache.
+	 * size of the zone.  But no more than 1/2 of a meg.
 	 *
 	 * OK, so we don't know how big the cache is.  So guess.
 	 */
 	batch = zone->present_pages / 1024;
-	if (batch * PAGE_SIZE > 256 * 1024)
-		batch = (256 * 1024) / PAGE_SIZE;
+	if (batch * PAGE_SIZE > 512 * 1024)
+		batch = (512 * 1024) / PAGE_SIZE;
 	batch /= 4;		/* We effectively *= 4 below */
 	if (batch < 1)
 		batch = 1;
 
 	/*
-	 * Clamp the batch to a 2^n - 1 value. Having a power
-	 * of 2 value was found to be more likely to have
-	 * suboptimal cache aliasing properties in some cases.
+	 * We will be trying to allocate bigger chunks of contiguous
+	 * memory of the order of fls(batch).  This should result in
+	 * better cache coloring.
 	 *
-	 * For example if 2 tasks are alternately allocating
-	 * batches of pages, one task can end up with a lot
-	 * of pages of one half of the possible page colors
-	 * and the other with pages of the other colors.
+	 * A sanity check also to ensure that batch is still in limits.
 	 */
-	batch = (1 << fls(batch + batch/2)) - 1;
+	batch = (1 << fls(batch + batch/2));
+
+	if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
+		batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
+
 	return batch;
 }
 

From e46a5e28c201f703c18b47b108bfddec44f897c4 Mon Sep 17 00:00:00 2001
From: "Seth, Rohit" <rohit.seth@intel.com>
Date: Sat, 29 Oct 2005 18:15:48 -0700
Subject: [PATCH 12/91] [PATCH] mm: set per-cpu-pages lower threshold to zero

Set the low water mark for hot pages in pcp to zero.

(akpm: for the life of me I cannot remember why we created pcp->low.  Neither
can Martin and the changelog is silent.  Maybe it was just a brainfart, but I
have this feeling that there was a reason.  If not, we should remove the
fields completely.  We'll see.)

Signed-off-by: Rohit Seth <rohit.seth@intel.com>
Cc: <linux-mm@kvack.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f799217dc2f3..60663232fbb2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1755,7 +1755,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 
 	pcp = &p->pcp[0];		/* hot */
 	pcp->count = 0;
-	pcp->low = 2 * batch;
+	pcp->low = 0;
 	pcp->high = 6 * batch;
 	pcp->batch = max(1UL, 1 * batch);
 	INIT_LIST_HEAD(&pcp->list);
@@ -1764,7 +1764,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 	pcp->count = 0;
 	pcp->low = 0;
 	pcp->high = 2 * batch;
-	pcp->batch = max(1UL, 1 * batch);
+	pcp->batch = max(1UL, batch/2);
 	INIT_LIST_HEAD(&pcp->list);
 }
 

From dfcd3c0dc426bb75770c34b40e14f2da8845ea62 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 29 Oct 2005 18:15:48 -0700
Subject: [PATCH 13/91] [PATCH] Convert mempolicies to nodemask_t

The NUMA policy code predated nodemask_t, so it used open-coded bitmaps.
Convert everything to nodemask_t.  Big patch, but it shouldn't have any actual
behaviour changes (except that I removed one unnecessary check against
node_online_map and one unnecessary BUG_ON).
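
The flavour of the conversion, taken from the hunks below:

	DECLARE_BITMAP(nodes, MAX_NUMNODES)        ->  nodemask_t nodes
	bitmap_empty(nodes, MAX_NUMNODES)          ->  nodes_empty(*nodes)
	test_bit(n, pol->v.nodes)                  ->  node_isset(n, pol->v.nodes)
	bitmap_copy(policy->v.nodes, nodes, ...)   ->  policy->v.nodes = *nodes
	find_first_bit/find_next_bit loop          ->  for_each_node_mask(nd, *nodes)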

Signed-off-by: "Andi Kleen" <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/task_mmu.c        |   2 +-
 include/linux/mempolicy.h |   4 +-
 mm/mempolicy.c            | 120 +++++++++++++++++---------------------
 3 files changed, 56 insertions(+), 70 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c7ef3e48e35b..994612bc72d0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -469,7 +469,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 		seq_printf(m, " interleave={");
 		first = 1;
 		for_each_node(n) {
-			if (test_bit(n, pol->v.nodes)) {
+			if (node_isset(n, pol->v.nodes)) {
 				if (!first)
 					seq_putc(m,',');
 				else
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 58385ee1c0ac..38e60a099399 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -27,10 +27,10 @@
 
 #include <linux/config.h>
 #include <linux/mmzone.h>
-#include <linux/bitmap.h>
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
+#include <linux/nodemask.h>
 
 struct vm_area_struct;
 
@@ -63,7 +63,7 @@ struct mempolicy {
 	union {
 		struct zonelist  *zonelist;	/* bind */
 		short 		 preferred_node; /* preferred */
-		DECLARE_BITMAP(nodes, MAX_NUMNODES); /* interleave */
+		nodemask_t	 nodes;		/* interleave */
 		/* undefined for default */
 	} v;
 };
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1d5c64df1653..8bc0be1c9efd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,23 +93,10 @@ struct mempolicy default_policy = {
 	.policy = MPOL_DEFAULT,
 };
 
-/* Check if all specified nodes are online */
-static int nodes_online(unsigned long *nodes)
-{
-	DECLARE_BITMAP(online2, MAX_NUMNODES);
-
-	bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
-	if (bitmap_empty(online2, MAX_NUMNODES))
-		set_bit(0, online2);
-	if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
-		return -EINVAL;
-	return 0;
-}
-
 /* Do sanity checking on a policy */
-static int mpol_check_policy(int mode, unsigned long *nodes)
+static int mpol_check_policy(int mode, nodemask_t *nodes)
 {
-	int empty = bitmap_empty(nodes, MAX_NUMNODES);
+	int empty = nodes_empty(*nodes);
 
 	switch (mode) {
 	case MPOL_DEFAULT:
@@ -124,11 +111,11 @@ static int mpol_check_policy(int mode, unsigned long *nodes)
 			return -EINVAL;
 		break;
 	}
-	return nodes_online(nodes);
+	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
 
 /* Copy a node mask from user space. */
-static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
+static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
 		     unsigned long maxnode, int mode)
 {
 	unsigned long k;
@@ -136,7 +123,7 @@ static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
 	unsigned long endmask;
 
 	--maxnode;
-	bitmap_zero(nodes, MAX_NUMNODES);
+	nodes_clear(*nodes);
 	if (maxnode == 0 || !nmask)
 		return 0;
 
@@ -153,7 +140,7 @@ static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
 			return -EINVAL;
 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 			unsigned long t;
-			if (get_user(t,  nmask + k))
+			if (get_user(t, nmask + k))
 				return -EFAULT;
 			if (k == nlongs - 1) {
 				if (t & endmask)
@@ -165,30 +152,29 @@ static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
 		endmask = ~0UL;
 	}
 
-	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
+	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 		return -EFAULT;
-	nodes[nlongs-1] &= endmask;
+	nodes_addr(*nodes)[nlongs-1] &= endmask;
 	/* Update current mems_allowed */
 	cpuset_update_current_mems_allowed();
 	/* Ignore nodes not set in current->mems_allowed */
-	cpuset_restrict_to_mems_allowed(nodes);
+	/* AK: shouldn't this error out instead? */
+	cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
 	return mpol_check_policy(mode, nodes);
 }
 
 /* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(unsigned long *nodes)
+static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
 	struct zonelist *zl;
 	int num, max, nd;
 
-	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
+	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
 	num = 0;
-	for (nd = find_first_bit(nodes, MAX_NUMNODES);
-	     nd < MAX_NUMNODES;
-	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
+	for_each_node_mask(nd, *nodes) {
 		int k;
 		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
@@ -205,11 +191,11 @@ static struct zonelist *bind_zonelist(unsigned long *nodes)
 }
 
 /* Create a new policy */
-static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
+static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
 
-	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
+	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 	if (mode == MPOL_DEFAULT)
 		return NULL;
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -218,10 +204,10 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
 	atomic_set(&policy->refcnt, 1);
 	switch (mode) {
 	case MPOL_INTERLEAVE:
-		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
+		policy->v.nodes = *nodes;
 		break;
 	case MPOL_PREFERRED:
-		policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
+		policy->v.preferred_node = first_node(*nodes);
 		if (policy->v.preferred_node >= MAX_NUMNODES)
 			policy->v.preferred_node = -1;
 		break;
@@ -239,7 +225,7 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
 
 /* Ensure all existing pages follow the policy. */
 static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
-		unsigned long addr, unsigned long end, unsigned long *nodes)
+		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
@@ -256,7 +242,7 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		if (!pfn_valid(pfn))
 			continue;
 		nid = pfn_to_nid(pfn);
-		if (!test_bit(nid, nodes))
+		if (!node_isset(nid, *nodes))
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap(orig_pte);
@@ -265,7 +251,7 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
 }
 
 static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
-		unsigned long addr, unsigned long end, unsigned long *nodes)
+		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -282,7 +268,7 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
 }
 
 static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
-		unsigned long addr, unsigned long end, unsigned long *nodes)
+		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -299,7 +285,7 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
 }
 
 static inline int check_pgd_range(struct mm_struct *mm,
-		unsigned long addr, unsigned long end, unsigned long *nodes)
+		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -318,7 +304,7 @@ static inline int check_pgd_range(struct mm_struct *mm,
 /* Step 1: check the range */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-	    unsigned long *nodes, unsigned long flags)
+	    nodemask_t *nodes, unsigned long flags)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -403,7 +389,7 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 	struct mm_struct *mm = current->mm;
 	struct mempolicy *new;
 	unsigned long end;
-	DECLARE_BITMAP(nodes, MAX_NUMNODES);
+	nodemask_t nodes;
 	int err;
 
 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@@ -419,19 +405,19 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 	if (end == start)
 		return 0;
 
-	err = get_nodes(nodes, nmask, maxnode, mode);
+	err = get_nodes(&nodes, nmask, maxnode, mode);
 	if (err)
 		return err;
 
-	new = mpol_new(mode, nodes);
+	new = mpol_new(mode, &nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
-			mode,nodes[0]);
+			mode,nodes_addr(nodes)[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nodes, flags);
+	vma = check_range(mm, start, end, &nodes, flags);
 	err = PTR_ERR(vma);
 	if (!IS_ERR(vma))
 		err = mbind_range(vma, start, end, new);
@@ -446,45 +432,45 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 {
 	int err;
 	struct mempolicy *new;
-	DECLARE_BITMAP(nodes, MAX_NUMNODES);
+	nodemask_t nodes;
 
 	if (mode < 0 || mode > MPOL_MAX)
 		return -EINVAL;
-	err = get_nodes(nodes, nmask, maxnode, mode);
+	err = get_nodes(&nodes, nmask, maxnode, mode);
 	if (err)
 		return err;
-	new = mpol_new(mode, nodes);
+	new = mpol_new(mode, &nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
 	if (new && new->policy == MPOL_INTERLEAVE)
-		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
+		current->il_next = first_node(new->v.nodes);
 	return 0;
 }
 
 /* Fill a zone bitmap for a policy */
-static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
+static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 {
 	int i;
 
-	bitmap_zero(nodes, MAX_NUMNODES);
+	nodes_clear(*nodes);
 	switch (p->policy) {
 	case MPOL_BIND:
 		for (i = 0; p->v.zonelist->zones[i]; i++)
-			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
+			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
 		break;
 	case MPOL_DEFAULT:
 		break;
 	case MPOL_INTERLEAVE:
-		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
+		*nodes = p->v.nodes;
 		break;
 	case MPOL_PREFERRED:
 		/* or use current node instead of online map? */
 		if (p->v.preferred_node < 0)
-			bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
+			*nodes = node_online_map;
 		else
-			__set_bit(p->v.preferred_node, nodes);
+			node_set(p->v.preferred_node, *nodes);
 		break;
 	default:
 		BUG();
@@ -506,9 +492,10 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
 
 /* Copy a kernel node mask to user space */
 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
-			      void *nodes, unsigned nbytes)
+			      nodemask_t *nodes)
 {
 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
+	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 
 	if (copy > nbytes) {
 		if (copy > PAGE_SIZE)
@@ -517,7 +504,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 			return -EFAULT;
 		copy = nbytes;
 	}
-	return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
+	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 }
 
 /* Retrieve NUMA policy */
@@ -578,9 +565,9 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
 
 	err = 0;
 	if (nmask) {
-		DECLARE_BITMAP(nodes, MAX_NUMNODES);
-		get_zonemask(pol, nodes);
-		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
+		nodemask_t nodes;
+		get_zonemask(pol, &nodes);
+		err = copy_nodes_to_user(nmask, maxnode, &nodes);
 	}
 
  out:
@@ -649,15 +636,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 	long err = 0;
 	unsigned long __user *nm = NULL;
 	unsigned long nr_bits, alloc_size;
-	DECLARE_BITMAP(bm, MAX_NUMNODES);
+	nodemask_t bm;
 
 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 
 	if (nmask) {
-		err = compat_get_bitmap(bm, nmask, nr_bits);
+		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 		nm = compat_alloc_user_space(alloc_size);
-		err |= copy_to_user(nm, bm, alloc_size);
+		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 	}
 
 	if (err)
@@ -723,9 +710,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 
 	nid = me->il_next;
 	BUG_ON(nid >= MAX_NUMNODES);
-	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
+	next = next_node(nid, policy->v.nodes);
 	if (next >= MAX_NUMNODES)
-		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
+		next = first_node(policy->v.nodes);
 	me->il_next = next;
 	return nid;
 }
@@ -734,18 +721,17 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 static unsigned offset_il_node(struct mempolicy *pol,
 		struct vm_area_struct *vma, unsigned long off)
 {
-	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
+	unsigned nnodes = nodes_weight(pol->v.nodes);
 	unsigned target = (unsigned)off % nnodes;
 	int c;
 	int nid = -1;
 
 	c = 0;
 	do {
-		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
+		nid = next_node(nid, pol->v.nodes);
 		c++;
 	} while (c <= target);
 	BUG_ON(nid >= MAX_NUMNODES);
-	BUG_ON(!test_bit(nid, pol->v.nodes));
 	return nid;
 }
 
@@ -878,7 +864,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	case MPOL_DEFAULT:
 		return 1;
 	case MPOL_INTERLEAVE:
-		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
+		return nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
 		return a->v.preferred_node == b->v.preferred_node;
 	case MPOL_BIND: {
@@ -1117,7 +1103,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
 		 vma->vm_pgoff,
 		 sz, npol? npol->policy : -1,
-		npol ? npol->v.nodes[0] : -1);
+		npol ? nodes_addr(npol->v.nodes)[0] : -1);
 
 	if (npol) {
 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);

From 662f3a0b94cc92bd708c27b80f8207cd7db93204 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 29 Oct 2005 18:15:49 -0700
Subject: [PATCH 14/91] [PATCH] Remove near all BUGs in mm/mempolicy.c

Most of them can never be triggered and were only for development.

Signed-off-by: "Andi Kleen" <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8bc0be1c9efd..43b1199af591 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -185,7 +185,6 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 				policy_zone = k;
 		}
 	}
-	BUG_ON(num >= max);
 	zl->zones[num] = NULL;
 	return zl;
 }
@@ -709,7 +708,6 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	struct task_struct *me = current;
 
 	nid = me->il_next;
-	BUG_ON(nid >= MAX_NUMNODES);
 	next = next_node(nid, policy->v.nodes);
 	if (next >= MAX_NUMNODES)
 		next = first_node(policy->v.nodes);
@@ -731,18 +729,17 @@ static unsigned offset_il_node(struct mempolicy *pol,
 		nid = next_node(nid, pol->v.nodes);
 		c++;
 	} while (c <= target);
-	BUG_ON(nid >= MAX_NUMNODES);
 	return nid;
 }
 
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
+static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
+					unsigned nid)
 {
 	struct zonelist *zl;
 	struct page *page;
 
-	BUG_ON(!node_online(nid));
 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
 	page = __alloc_pages(gfp, order, zl);
 	if (page && page_zone(page) == zl->zones[0]) {
@@ -785,8 +782,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		unsigned nid;
 		if (vma) {
 			unsigned long off;
-			BUG_ON(addr >= vma->vm_end);
-			BUG_ON(addr < vma->vm_start);
 			off = vma->vm_pgoff;
 			off += (addr - vma->vm_start) >> PAGE_SHIFT;
 			nid = offset_il_node(pol, vma, off);

From b57b98d147ef98758886a39efb94f3254542c39b Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Sat, 29 Oct 2005 18:15:50 -0700
Subject: [PATCH 15/91] [PATCH] mm/msync.c cleanup

This is not actually a problem, but sync_page_range() is already used as
the name of a function exported to filesystems.

The msync_xxx naming is more readable, at least to me.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/msync.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/mm/msync.c b/mm/msync.c
index d0f5a1bce7cb..9cab3f2d5863 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -22,7 +22,7 @@
  * threads/the swapper from ripping pte's out from under us.
  */
 
-static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end)
 {
 	pte_t *pte;
@@ -50,7 +50,7 @@ static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	pte_unmap(pte - 1);
 }
 
-static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 				unsigned long addr, unsigned long end)
 {
 	pmd_t *pmd;
@@ -61,11 +61,11 @@ static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		sync_pte_range(vma, pmd, addr, next);
+		msync_pte_range(vma, pmd, addr, next);
 	} while (pmd++, addr = next, addr != end);
 }
 
-static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 				unsigned long addr, unsigned long end)
 {
 	pud_t *pud;
@@ -76,11 +76,11 @@ static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		sync_pmd_range(vma, pud, addr, next);
+		msync_pmd_range(vma, pud, addr, next);
 	} while (pud++, addr = next, addr != end);
 }
 
-static void sync_page_range(struct vm_area_struct *vma,
+static void msync_page_range(struct vm_area_struct *vma,
 				unsigned long addr, unsigned long end)
 {
 	struct mm_struct *mm = vma->vm_mm;
@@ -101,14 +101,14 @@ static void sync_page_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		sync_pud_range(vma, pgd, addr, next);
+		msync_pud_range(vma, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 	spin_unlock(&mm->page_table_lock);
 }
 
 #ifdef CONFIG_PREEMPT
-static inline void filemap_sync(struct vm_area_struct *vma,
-				unsigned long addr, unsigned long end)
+static inline void filemap_msync(struct vm_area_struct *vma,
+				 unsigned long addr, unsigned long end)
 {
 	const size_t chunk = 64 * 1024;	/* bytes */
 	unsigned long next;
@@ -117,15 +117,15 @@ static inline void filemap_sync(struct vm_area_struct *vma,
 		next = addr + chunk;
 		if (next > end || next < addr)
 			next = end;
-		sync_page_range(vma, addr, next);
+		msync_page_range(vma, addr, next);
 		cond_resched();
 	} while (addr = next, addr != end);
 }
 #else
-static inline void filemap_sync(struct vm_area_struct *vma,
-				unsigned long addr, unsigned long end)
+static inline void filemap_msync(struct vm_area_struct *vma,
+				 unsigned long addr, unsigned long end)
 {
-	sync_page_range(vma, addr, end);
+	msync_page_range(vma, addr, end);
 }
 #endif
 
@@ -150,7 +150,7 @@ static int msync_interval(struct vm_area_struct *vma,
 		return -EBUSY;
 
 	if (file && (vma->vm_flags & VM_SHARED)) {
-		filemap_sync(vma, addr, end);
+		filemap_msync(vma, addr, end);
 
 		if (flags & MS_SYNC) {
 			struct address_space *mapping = file->f_mapping;

From c340010e4bf824d969a89fa192ecc7a526c0cd24 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Sat, 29 Oct 2005 18:15:51 -0700
Subject: [PATCH 16/91] [PATCH] shrink_list(): skip anon pages if not may_swap

Martin Hicks' page cache reclaim patch added the 'may_swap' flag to the
scan_control struct, and modified shrink_list() not to add anon pages to
the swap cache if may_swap is not asserted.

Ref:  http://marc.theaimsgroup.com/?l=linux-mm&m=111461480725322&w=4

However, further down, if the page is mapped, shrink_list() calls
try_to_unmap(), which will call try_to_unmap_one() via try_to_unmap_anon().
try_to_unmap_one() will BUG_ON() an anon page that is NOT in the swap
cache.  Martin says he never encountered this path in his testing, but
agrees that it might happen.

This patch modifies shrink_list() to skip anon pages that are not already
in the swap cache when !may_swap, rather than just not adding them to the
cache.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 843c87d1e61f..41d1064aabfb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -417,7 +417,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
 		 */
-		if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) {
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!sc->may_swap)
+				goto keep_locked;
 			if (!add_to_swap(page))
 				goto activate_locked;
 		}

From 09ad4bbc3a5c93316d7f4ffc0c310d9cbb28c2f0 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Sat, 29 Oct 2005 18:15:52 -0700
Subject: [PATCH 17/91] [PATCH] slab: add additional debugging to detect slabs
 from the wrong node

This patch adds some stack dumps if the slab logic is processing slab
blocks from the wrong node.  This is necessary in order to detect
situations such as the one encountered by Petr.
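
Roughly, the added checks amount to a non-fatal assertion of the form
sketched below (userspace stand-in: the real WARN_ON() also dumps a stack
trace, and slab_node/this_node are made-up variables):

  #include <stdio.h>

  #define WARN_ON(cond) \
          do { \
                  if (cond) \
                          printf("WARNING at %s:%d: %s\n", \
                                 __FILE__, __LINE__, #cond); \
          } while (0)

  int main(void)
  {
          int slab_node = 1, this_node = 0;

          WARN_ON(slab_node != this_node);        /* fires: nodes disagree */
          return 0;
  }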

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/slab.c b/mm/slab.c
index d30423f167a2..22bfb0b2ac8b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2419,6 +2419,7 @@ retry:
 			next = slab_bufctl(slabp)[slabp->free];
 #if DEBUG
 			slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+			WARN_ON(numa_node_id() != slabp->nodeid);
 #endif
 		       	slabp->free = next;
 		}
@@ -2633,8 +2634,10 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
 		check_spinlock_acquired_node(cachep, node);
 		check_slabp(cachep, slabp);
 
-
 #if DEBUG
+		/* Verify that the slab belongs to the intended node */
+		WARN_ON(slabp->nodeid != node);
+
 		if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
 			printk(KERN_ERR "slab: double free detected in cache "
 					"'%s', objp %p\n", cachep->name, objp);

From e040f218bb49a6965a5b77edce05fe47a62dda39 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:15:53 -0700
Subject: [PATCH 18/91] [PATCH] mm: copy_pte_range progress fix

My latency breaking in copy_pte_range didn't work as intended: instead of
checking at regularish intervals, after the first interval it checked every
time around the loop, too impatient to be preempted.  Fix that.
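
The intended pattern, as a rough self-contained sketch (should_break() is
a stand-in for the need_resched()/need_lockbreak() tests, not kernel
code): the counter has to be reset inside the periodic check, otherwise
once it first reaches 32 the expensive test runs on every later pass.

  #include <stdio.h>

  static int should_break(void) { return 0; }  /* stand-in for need_resched() etc. */

  static void copy_items(int nitems)
  {
          int progress = 0;
          int i;

          for (i = 0; i < nitems; i++) {
                  if (progress >= 32) {
                          progress = 0;   /* the fix: re-arm the interval */
                          if (should_break())
                                  break;
                  }
                  progress++;
                  /* ... copy one item ... */
          }
          printf("copied %d of %d items\n", i, nitems);
  }

  int main(void)
  {
          copy_items(100);
          return 0;
  }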

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 1db40e935e55..222c13e46130 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -410,7 +410,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 {
 	pte_t *src_pte, *dst_pte;
 	unsigned long vm_flags = vma->vm_flags;
-	int progress;
+	int progress = 0;
 
 again:
 	dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
@@ -418,17 +418,19 @@ again:
 		return -ENOMEM;
 	src_pte = pte_offset_map_nested(src_pmd, addr);
 
-	progress = 0;
 	spin_lock(&src_mm->page_table_lock);
 	do {
 		/*
 		 * We are holding two locks at this point - either of them
 		 * could generate latencies in another task on another CPU.
 		 */
-		if (progress >= 32 && (need_resched() ||
-		    need_lockbreak(&src_mm->page_table_lock) ||
-		    need_lockbreak(&dst_mm->page_table_lock)))
-			break;
+		if (progress >= 32) {
+			progress = 0;
+			if (need_resched() ||
+			    need_lockbreak(&src_mm->page_table_lock) ||
+			    need_lockbreak(&dst_mm->page_table_lock))
+				break;
+		}
 		if (pte_none(*src_pte)) {
 			progress++;
 			continue;

From 0c942a4539c09adf09097315cc174aefd0eeedf7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:15:53 -0700
Subject: [PATCH 19/91] [PATCH] mm: msync_pte_range progress

Use latency breaking in msync_pte_range like that in copy_pte_range, instead
of the ugly CONFIG_PREEMPT filemap_msync alternatives.
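
The resulting shape, as a rough userspace sketch (yield_point() stands in
for the pte_unmap()/cond_resched_lock() step, and plain integers stand in
for addresses): the inner loop bails out after a budget of work, yields,
and resumes where it stopped, so no outer chunking wrapper is needed.

  #include <stdio.h>

  static void yield_point(void) { /* stand-in for cond_resched_lock() */ }

  static void sync_range(int addr, int end)
  {
          int progress = 0;

  again:
          for (; addr != end; addr++) {
                  if (progress >= 64) {
                          progress = 0;
                          break;          /* drop out to the yield point */
                  }
                  progress++;
                  /* ... examine and clean one entry at addr ... */
          }
          yield_point();
          if (addr != end)
                  goto again;
  }

  int main(void)
  {
          sync_range(0, 200);
          printf("done\n");
          return 0;
  }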

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/msync.c | 38 ++++++++++++++------------------------
 1 file changed, 14 insertions(+), 24 deletions(-)

diff --git a/mm/msync.c b/mm/msync.c
index 9cab3f2d5863..3b5f1c521d4b 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -26,12 +26,21 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end)
 {
 	pte_t *pte;
+	int progress = 0;
 
+again:
 	pte = pte_offset_map(pmd, addr);
 	do {
 		unsigned long pfn;
 		struct page *page;
 
+		if (progress >= 64) {
+			progress = 0;
+			if (need_resched() ||
+			    need_lockbreak(&vma->vm_mm->page_table_lock))
+				break;
+		}
+		progress++;
 		if (!pte_present(*pte))
 			continue;
 		if (!pte_maybe_dirty(*pte))
@@ -46,8 +55,12 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (ptep_clear_flush_dirty(vma, addr, pte) ||
 		    page_test_and_clear_dirty(page))
 			set_page_dirty(page);
+		progress += 3;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap(pte - 1);
+	cond_resched_lock(&vma->vm_mm->page_table_lock);
+	if (addr != end)
+		goto again;
 }
 
 static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -106,29 +119,6 @@ static void msync_page_range(struct vm_area_struct *vma,
 	spin_unlock(&mm->page_table_lock);
 }
 
-#ifdef CONFIG_PREEMPT
-static inline void filemap_msync(struct vm_area_struct *vma,
-				 unsigned long addr, unsigned long end)
-{
-	const size_t chunk = 64 * 1024;	/* bytes */
-	unsigned long next;
-
-	do {
-		next = addr + chunk;
-		if (next > end || next < addr)
-			next = end;
-		msync_page_range(vma, addr, next);
-		cond_resched();
-	} while (addr = next, addr != end);
-}
-#else
-static inline void filemap_msync(struct vm_area_struct *vma,
-				 unsigned long addr, unsigned long end)
-{
-	msync_page_range(vma, addr, end);
-}
-#endif
-
 /*
  * MS_SYNC syncs the entire file - including mappings.
  *
@@ -150,7 +140,7 @@ static int msync_interval(struct vm_area_struct *vma,
 		return -EBUSY;
 
 	if (file && (vma->vm_flags & VM_SHARED)) {
-		filemap_msync(vma, addr, end);
+		msync_page_range(vma, addr, end);
 
 		if (flags & MS_SYNC) {
 			struct address_space *mapping = file->f_mapping;

From 6237bcd94851e9cf0ecd2520d744779df0f5a9a6 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:15:54 -0700
Subject: [PATCH 20/91] [PATCH] mm: zap_pte_range dont dirty anon

zap_pte_range already avoids wasting time on mark_page_accessed for anon
pages: it can also skip set_page_dirty for them - an anon page only needs
to be marked dirty if it is shared with another mm, but in that case the
other mapping's pte will say pte_dirty too.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 222c13e46130..fd5d4c6dc762 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -574,12 +574,14 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 						addr) != page->index)
 				set_pte_at(tlb->mm, addr, pte,
 					   pgoff_to_pte(page->index));
-			if (pte_dirty(ptent))
-				set_page_dirty(page);
 			if (PageAnon(page))
 				dec_mm_counter(tlb->mm, anon_rss);
-			else if (pte_young(ptent))
-				mark_page_accessed(page);
+			else {
+				if (pte_dirty(ptent))
+					set_page_dirty(page);
+				if (pte_young(ptent))
+					mark_page_accessed(page);
+			}
 			tlb->freed++;
 			page_remove_rmap(page);
 			tlb_remove_page(tlb, page);

From 72866f6f277ec0ddd6df7a3b6ecdcf59a28de115 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:15:55 -0700
Subject: [PATCH 21/91] [PATCH] mm: anon is already wrprotected

do_anonymous_page's pte_wrprotect causes some confusion: in such a case,
vm_page_prot must already be forcing COW, so must omit write permission, and
so the pte_wrprotect is redundant.  Replace it by a comment to that effect,
and reword the comment on unuse_pte which also caused confusion.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c   | 7 ++++---
 mm/swapfile.c | 7 +++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index fd5d4c6dc762..13667681cd16 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1768,13 +1768,14 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long addr)
 {
 	pte_t entry;
-	struct page * page = ZERO_PAGE(addr);
 
-	/* Read-only mapping of ZERO_PAGE. */
-	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
+	/* Mapping of ZERO_PAGE - vm_page_prot is readonly */
+	entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot);
 
 	/* ..except if it's a write access */
 	if (write_access) {
+		struct page *page;
+
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
 		spin_unlock(&mm->page_table_lock);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1dcaeda039f4..05c851291241 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -398,10 +398,9 @@ void free_swap_and_cache(swp_entry_t entry)
 }
 
 /*
- * Always set the resulting pte to be nowrite (the same as COW pages
- * after one process has exited).  We don't know just how many PTEs will
- * share this swap entry, so be cautious and let do_wp_page work out
- * what to do if a write is requested later.
+ * No need to decide whether this PTE shares the swap entry with others,
+ * just let do_wp_page work it out if a write is requested later - to
+ * force COW, vm_page_prot omits write permission from any private vma.
  *
  * vma->vm_mm->page_table_lock is held.
  */

From ab50b8ed818016cfecd747d6d4bb9139986bc029 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:15:56 -0700
Subject: [PATCH 22/91] [PATCH] mm: vm_stat_account unshackled

The original vm_stat_account has fallen into disuse, with only one user, and
only one user of vm_stat_unaccount.  It's easier to keep track if we convert
them all to __vm_stat_account, then free it from its __shackles.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/perfmon.c |  3 ++-
 arch/ia64/mm/fault.c       |  2 +-
 include/linux/mm.h         | 16 ++--------------
 kernel/fork.c              |  2 +-
 mm/mmap.c                  | 20 ++++++++++----------
 mm/mprotect.c              |  4 ++--
 mm/mremap.c                |  4 ++--
 7 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index d71731ee5b61..f7dfc107cb7b 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2352,7 +2352,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
 	insert_vm_struct(mm, vma);
 
 	mm->total_vm  += size >> PAGE_SHIFT;
-	vm_stat_account(vma);
+	vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
+							vma_pages(vma));
 	up_write(&task->mm->mmap_sem);
 
 	/*
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 3c32af910d60..f21b55549787 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -41,7 +41,7 @@ expand_backing_store (struct vm_area_struct *vma, unsigned long address)
 	vma->vm_mm->total_vm += grow;
 	if (vma->vm_flags & VM_LOCKED)
 		vma->vm_mm->locked_vm += grow;
-	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow);
+	vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow);
 	return 0;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e1649578fb0c..376a466743bc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -928,26 +928,14 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long,
 		unsigned long, unsigned long, pgprot_t);
 
 #ifdef CONFIG_PROC_FS
-void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
+void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
 #else
-static inline void __vm_stat_account(struct mm_struct *mm,
+static inline void vm_stat_account(struct mm_struct *mm,
 			unsigned long flags, struct file *file, long pages)
 {
 }
 #endif /* CONFIG_PROC_FS */
 
-static inline void vm_stat_account(struct vm_area_struct *vma)
-{
-	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-							vma_pages(vma));
-}
-
-static inline void vm_stat_unaccount(struct vm_area_struct *vma)
-{
-	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-							-vma_pages(vma));
-}
-
 /* update per process rss and vm hiwater data */
 extern void update_mem_hiwater(struct task_struct *tsk);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 280bd44ac441..e2ff11f8c1b0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -212,7 +212,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 		if (mpnt->vm_flags & VM_DONTCOPY) {
 			long pages = vma_pages(mpnt);
 			mm->total_vm -= pages;
-			__vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
+			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
 								-pages);
 			continue;
 		}
diff --git a/mm/mmap.c b/mm/mmap.c
index fa11d91242e8..e1780266ac7d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -832,7 +832,7 @@ none:
 }
 
 #ifdef CONFIG_PROC_FS
-void __vm_stat_account(struct mm_struct *mm, unsigned long flags,
+void vm_stat_account(struct mm_struct *mm, unsigned long flags,
 						struct file *file, long pages)
 {
 	const unsigned long stack_flags
@@ -1110,7 +1110,7 @@ munmap_back:
 	}
 out:	
 	mm->total_vm += len >> PAGE_SHIFT;
-	__vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += len >> PAGE_SHIFT;
 		make_pages_present(addr, addr + len);
@@ -1475,7 +1475,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
 	mm->total_vm += grow;
 	if (vma->vm_flags & VM_LOCKED)
 		mm->locked_vm += grow;
-	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
+	vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
 	return 0;
 }
 
@@ -1610,15 +1610,15 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
  * By the time this function is called, the area struct has been
  * removed from the process mapping list.
  */
-static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
+static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-	size_t len = area->vm_end - area->vm_start;
+	long nrpages = vma_pages(vma);
 
-	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
-	if (area->vm_flags & VM_LOCKED)
-		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
-	vm_stat_unaccount(area);
-	remove_vm_struct(area);
+	mm->total_vm -= nrpages;
+	if (vma->vm_flags & VM_LOCKED)
+		mm->locked_vm -= nrpages;
+	vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
+	remove_vm_struct(vma);
 }
 
 /*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 57577f63b305..b426f01c5e9c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -168,8 +168,8 @@ success:
 	vma->vm_flags = newflags;
 	vma->vm_page_prot = newprot;
 	change_protection(vma, start, end, newprot);
-	__vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
-	__vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
+	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
 
 fail:
diff --git a/mm/mremap.c b/mm/mremap.c
index f343fc73a8bd..55df8f53e84d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -233,7 +233,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	 * since do_munmap() will decrement it by old_len == new_len
 	 */
 	mm->total_vm += new_len >> PAGE_SHIFT;
-	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+	vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 
 	if (do_munmap(mm, old_addr, old_len) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
@@ -384,7 +384,7 @@ unsigned long do_mremap(unsigned long addr,
 				addr + new_len, vma->vm_pgoff, NULL);
 
 			current->mm->total_vm += pages;
-			__vm_stat_account(vma->vm_mm, vma->vm_flags,
+			vm_stat_account(vma->vm_mm, vma->vm_flags,
 							vma->vm_file, pages);
 			if (vma->vm_flags & VM_LOCKED) {
 				current->mm->locked_vm += pages;

From 2c0b381467bc2997be9d741a152f3fc75785eedc Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:15:56 -0700
Subject: [PATCH 23/91] [PATCH] mm: remove_vma_list consolidation

unmap_vma doesn't amount to much, so let's put it inside unmap_vma_list.
Except that it doesn't unmap anything - unmap_region just did the
unmapping - so rename it to remove_vma_list.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mmap.c | 36 ++++++++++++------------------------
 1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index e1780266ac7d..eeefe19a0fac 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1603,35 +1603,23 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
 }
 #endif
 
-/* Normal function to fix up a mapping
- * This function is the default for when an area has no specific
- * function.  This may be used as part of a more specific routine.
- *
- * By the time this function is called, the area struct has been
- * removed from the process mapping list.
- */
-static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *vma)
-{
-	long nrpages = vma_pages(vma);
-
-	mm->total_vm -= nrpages;
-	if (vma->vm_flags & VM_LOCKED)
-		mm->locked_vm -= nrpages;
-	vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
-	remove_vm_struct(vma);
-}
-
 /*
- * Update the VMA and inode share lists.
- *
- * Ok - we have the memory areas we should free on the 'free' list,
+ * Ok - we have the memory areas we should free on the vma list,
  * so release them, and do the vma updates.
+ *
+ * Called with the mm semaphore held.
  */
-static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 {
 	do {
 		struct vm_area_struct *next = vma->vm_next;
-		unmap_vma(mm, vma);
+		long nrpages = vma_pages(vma);
+
+		mm->total_vm -= nrpages;
+		if (vma->vm_flags & VM_LOCKED)
+			mm->locked_vm -= nrpages;
+		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
+		remove_vm_struct(vma);
 		vma = next;
 	} while (vma);
 	validate_mm(mm);
@@ -1799,7 +1787,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 	unmap_region(mm, vma, prev, start, end);
 
 	/* Fix up all other VM information */
-	unmap_vma_list(mm, vma);
+	remove_vma_list(mm, vma);
 
 	return 0;
 }

From a8fb5618dab7e45c8990f3155628d772a9ed45f9 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:15:57 -0700
Subject: [PATCH 24/91] [PATCH] mm: unlink_file_vma, remove_vma

Divide remove_vm_struct into two parts: first anon_vma_unlink plus
unlink_file_vma, to unlink the vma from the list and tree by which rmap or
vmtruncate might find it; then remove_vma to close, fput and free.

The intention here is to do the anon_vma_unlink and unlink_file_vma earlier,
in free_pgtables before freeing any page tables: so we can be sure that any
page tables traversed by rmap and vmtruncate are stable (and other, ordinary
cases are stabilized by holding mmap_sem).

This will be crucial to traversing pgd,pud,pmd without page_table_lock.  But
testing the split-out patch showed that lifting the page_table_lock is
symbiotically necessary to make this change - the lock ordering is wrong to
move those unlinks into free_pgtables while it's under ptlock.
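
Schematically (a userspace sketch with a hypothetical struct obj,
unlink_obj() and remove_obj(); the real structures involved are the
anon_vma list and the file's prio_tree), the split separates "make the
object unfindable" from "close and free it", so that the first half can
later be called earlier on its own:

  #include <stdio.h>
  #include <stdlib.h>

  struct obj {
          struct obj *next;       /* list others may walk to find the object */
          int id;
  };

  static struct obj *lookup_list;

  /* Phase 1: unlink, so nobody walking lookup_list can reach the object. */
  static void unlink_obj(struct obj *o)
  {
          struct obj **p = &lookup_list;

          while (*p && *p != o)
                  p = &(*p)->next;
          if (*p)
                  *p = o->next;
  }

  /* Phase 2: tear down and free, returning the next object like remove_vma(). */
  static struct obj *remove_obj(struct obj *o)
  {
          struct obj *next = o->next;

          unlink_obj(o);          /* for now both phases still run together */
          printf("freeing %d\n", o->id);
          free(o);
          return next;
  }

  int main(void)
  {
          struct obj *o;
          int i;

          for (i = 0; i < 3; i++) {
                  o = malloc(sizeof(*o));
                  if (!o)
                          return 1;
                  o->id = i;
                  o->next = lookup_list;
                  lookup_list = o;
          }
          for (o = lookup_list; o; )
                  o = remove_obj(o);
          return 0;
  }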

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h |  1 +
 mm/mmap.c          | 41 +++++++++++++++++++++++++++--------------
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 376a466743bc..0c64484d8ae0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -834,6 +834,7 @@ extern int split_vma(struct mm_struct *,
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
+extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff);
 extern void exit_mmap(struct mm_struct *);
diff --git a/mm/mmap.c b/mm/mmap.c
index eeefe19a0fac..a3984fad3fc2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -181,26 +181,44 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 }
 
 /*
- * Remove one vm structure and free it.
+ * Unlink a file-based vm structure from its prio_tree, to hide
+ * vma from rmap and vmtruncate before freeing its page tables.
  */
-static void remove_vm_struct(struct vm_area_struct *vma)
+void unlink_file_vma(struct vm_area_struct *vma)
 {
 	struct file *file = vma->vm_file;
 
-	might_sleep();
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
 		spin_lock(&mapping->i_mmap_lock);
 		__remove_shared_vm_struct(vma, file, mapping);
 		spin_unlock(&mapping->i_mmap_lock);
 	}
+}
+
+/*
+ * Close a vm structure and free it, returning the next.
+ */
+static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+{
+	struct vm_area_struct *next = vma->vm_next;
+
+	/*
+	 * Hide vma from rmap and vmtruncate before freeing page tables:
+	 * to be moved into free_pgtables once page_table_lock is lifted
+	 * from it, but until then lock ordering forbids that move.
+	 */
+	anon_vma_unlink(vma);
+	unlink_file_vma(vma);
+
+	might_sleep();
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
-	if (file)
-		fput(file);
-	anon_vma_unlink(vma);
+	if (vma->vm_file)
+		fput(vma->vm_file);
 	mpol_free(vma_policy(vma));
 	kmem_cache_free(vm_area_cachep, vma);
+	return next;
 }
 
 asmlinkage unsigned long sys_brk(unsigned long brk)
@@ -1612,15 +1630,13 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 {
 	do {
-		struct vm_area_struct *next = vma->vm_next;
 		long nrpages = vma_pages(vma);
 
 		mm->total_vm -= nrpages;
 		if (vma->vm_flags & VM_LOCKED)
 			mm->locked_vm -= nrpages;
 		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
-		remove_vm_struct(vma);
-		vma = next;
+		vma = remove_vma(vma);
 	} while (vma);
 	validate_mm(mm);
 }
@@ -1944,11 +1960,8 @@ void exit_mmap(struct mm_struct *mm)
 	 * Walk the list again, actually closing and freeing it
 	 * without holding any MM locks.
 	 */
-	while (vma) {
-		struct vm_area_struct *next = vma->vm_next;
-		remove_vm_struct(vma);
-		vma = next;
-	}
+	while (vma)
+		vma = remove_vma(vma);
 
 	BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }

From 7c1fd6b964860cdcf44b6b98d7dcd8cc16a0a26d Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:15:58 -0700
Subject: [PATCH 25/91] [PATCH] mm: exit_mmap need not reset

exit_mmap resets various mm_struct fields, but the mm is well on its way out,
and none of those fields matter by this point.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mmap.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index a3984fad3fc2..459b9f068ad7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1948,12 +1948,6 @@ void exit_mmap(struct mm_struct *mm)
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
 
-	mm->mmap = mm->mmap_cache = NULL;
-	mm->mm_rb = RB_ROOT;
-	set_mm_counter(mm, rss, 0);
-	mm->total_vm = 0;
-	mm->locked_vm = 0;
-
 	spin_unlock(&mm->page_table_lock);
 
 	/*

From 65500d234e74fc4e8f18e1a429bc24e51e75de4a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:15:59 -0700
Subject: [PATCH 26/91] [PATCH] mm: page fault handlers tidyup

Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass
their arguments in the same order, under the same names?

break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.

do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that.  BUG_ON if not?  Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.

Hah!  Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
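
For reference, the macro being restored is only a file-and-line
diagnostic.  A userspace sketch of the same style (entry_ERROR is a
made-up name and printf stands in for printk):

  #include <stdio.h>

  #define entry_ERROR(e) \
          printf("%s:%d: bad entry %08lx.\n", __FILE__, __LINE__, \
                 (unsigned long)(e))

  int main(void)
  {
          entry_ERROR(0xdeadbeefUL);
          return 0;
  }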

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ppc64/pgtable.h |   4 +-
 mm/filemap.c                |   2 +-
 mm/memory.c                 | 220 ++++++++++++++++--------------------
 mm/shmem.c                  |   2 +-
 4 files changed, 102 insertions(+), 126 deletions(-)

diff --git a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h
index c83679c9d2b0..2eb1778a3a15 100644
--- a/include/asm-ppc64/pgtable.h
+++ b/include/asm-ppc64/pgtable.h
@@ -478,10 +478,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
 #define __HAVE_ARCH_PTE_SAME
 #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
 
+#define pte_ERROR(e) \
+	printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
 #define pmd_ERROR(e) \
 	printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
 #define pud_ERROR(e) \
-	printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e))
+	printk("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
 #define pgd_ERROR(e) \
 	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 1c31b2fd2ca5..8aa344e88489 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1520,7 +1520,7 @@ repeat:
 			page_cache_release(page);
 			return err;
 		}
-	} else {
+	} else if (vma->vm_flags & VM_NONLINEAR) {
 		/* No page was found just because we can't read it in now (being
 		 * here implies nonblock != 0), but the page may exist, so set
 		 * the PTE to fault it in later. */
diff --git a/mm/memory.c b/mm/memory.c
index 13667681cd16..eaf79031f573 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1212,29 +1212,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
-/*
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
- */
-static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, 
-		pte_t *page_table)
-{
-	pte_t entry;
-
-	entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
-			      vma);
-	ptep_establish(vma, address, page_table, entry);
-	update_mmu_cache(vma, address, entry);
-	lazy_mmu_prot_update(entry);
-}
-
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
  * and decrementing the shared-page counter for the old page.
  *
- * Goto-purists beware: the only reason for goto's here is that it results
- * in better assembly code.. The "default" path will see no jumps at all.
- *
  * Note that this routine assumes that the protection checks have been
  * done by the caller (the low-level page fault routine in most cases).
  * Thus we can safely just mark it writable once we've done any necessary
@@ -1247,25 +1229,22 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page
  * We hold the mm semaphore and the page_table_lock on entry and exit
  * with the page_table_lock released.
  */
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
-	unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		pte_t orig_pte)
 {
 	struct page *old_page, *new_page;
-	unsigned long pfn = pte_pfn(pte);
+	unsigned long pfn = pte_pfn(orig_pte);
 	pte_t entry;
-	int ret;
+	int ret = VM_FAULT_MINOR;
 
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
-		 * This should really halt the system so it can be debugged or
-		 * at least the kernel stops what it's doing before it corrupts
-		 * data, but for the moment just pretend this is OOM.
+		 * Page table corrupted: show pte and kill process.
 		 */
-		pte_unmap(page_table);
-		printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
-				address);
-		spin_unlock(&mm->page_table_lock);
-		return VM_FAULT_OOM;
+		pte_ERROR(orig_pte);
+		ret = VM_FAULT_OOM;
+		goto unlock;
 	}
 	old_page = pfn_to_page(pfn);
 
@@ -1274,52 +1253,57 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 		unlock_page(old_page);
 		if (reuse) {
 			flush_cache_page(vma, address, pfn);
-			entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
-					      vma);
+			entry = pte_mkyoung(orig_pte);
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 			ptep_set_access_flags(vma, address, page_table, entry, 1);
 			update_mmu_cache(vma, address, entry);
 			lazy_mmu_prot_update(entry);
-			pte_unmap(page_table);
-			spin_unlock(&mm->page_table_lock);
-			return VM_FAULT_MINOR|VM_FAULT_WRITE;
+			ret |= VM_FAULT_WRITE;
+			goto unlock;
 		}
 	}
-	pte_unmap(page_table);
 
 	/*
 	 * Ok, we need to copy. Oh, well..
 	 */
 	if (!PageReserved(old_page))
 		page_cache_get(old_page);
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 
 	if (unlikely(anon_vma_prepare(vma)))
-		goto no_new_page;
+		goto oom;
 	if (old_page == ZERO_PAGE(address)) {
 		new_page = alloc_zeroed_user_highpage(vma, address);
 		if (!new_page)
-			goto no_new_page;
+			goto oom;
 	} else {
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
-			goto no_new_page;
+			goto oom;
 		copy_user_highpage(new_page, old_page, address);
 	}
+
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
-	ret = VM_FAULT_MINOR;
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
-	if (likely(pte_same(*page_table, pte))) {
+	if (likely(pte_same(*page_table, orig_pte))) {
 		if (PageAnon(old_page))
 			dec_mm_counter(mm, anon_rss);
 		if (PageReserved(old_page))
 			inc_mm_counter(mm, rss);
 		else
 			page_remove_rmap(old_page);
+
 		flush_cache_page(vma, address, pfn);
-		break_cow(vma, new_page, address, page_table);
+		entry = mk_pte(new_page, vma->vm_page_prot);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		ptep_establish(vma, address, page_table, entry);
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
+
 		lru_cache_add_active(new_page);
 		page_add_anon_rmap(new_page, vma, address);
 
@@ -1327,13 +1311,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 		new_page = old_page;
 		ret |= VM_FAULT_WRITE;
 	}
-	pte_unmap(page_table);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
+unlock:
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 	return ret;
-
-no_new_page:
+oom:
 	page_cache_release(old_page);
 	return VM_FAULT_OOM;
 }
@@ -1661,17 +1645,19 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc
  * We hold the mm semaphore and the page_table_lock on entry and
  * should release the pagetable lock on exit..
  */
-static int do_swap_page(struct mm_struct * mm,
-	struct vm_area_struct * vma, unsigned long address,
-	pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
+static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pte_t orig_pte)
 {
 	struct page *page;
-	swp_entry_t entry = pte_to_swp_entry(orig_pte);
+	swp_entry_t entry;
 	pte_t pte;
 	int ret = VM_FAULT_MINOR;
 
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
+
+	entry = pte_to_swp_entry(orig_pte);
 	page = lookup_swap_cache(entry);
 	if (!page) {
  		swapin_readahead(entry, address, vma);
@@ -1685,11 +1671,7 @@ static int do_swap_page(struct mm_struct * mm,
 			page_table = pte_offset_map(pmd, address);
 			if (likely(pte_same(*page_table, orig_pte)))
 				ret = VM_FAULT_OOM;
-			else
-				ret = VM_FAULT_MINOR;
-			pte_unmap(page_table);
-			spin_unlock(&mm->page_table_lock);
-			goto out;
+			goto unlock;
 		}
 
 		/* Had to read the page from swap area: Major fault */
@@ -1745,6 +1727,7 @@ static int do_swap_page(struct mm_struct * mm,
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, pte);
 	lazy_mmu_prot_update(pte);
+unlock:
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 out:
@@ -1754,7 +1737,7 @@ out_nomap:
 	spin_unlock(&mm->page_table_lock);
 	unlock_page(page);
 	page_cache_release(page);
-	goto out;
+	return ret;
 }
 
 /*
@@ -1762,17 +1745,15 @@ out_nomap:
  * spinlock held to protect against concurrent faults in
  * multithreaded programs. 
  */
-static int
-do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		pte_t *page_table, pmd_t *pmd, int write_access,
-		unsigned long addr)
+static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access)
 {
 	pte_t entry;
 
 	/* Mapping of ZERO_PAGE - vm_page_prot is readonly */
 	entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot);
 
-	/* ..except if it's a write access */
 	if (write_access) {
 		struct page *page;
 
@@ -1781,39 +1762,36 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		spin_unlock(&mm->page_table_lock);
 
 		if (unlikely(anon_vma_prepare(vma)))
-			goto no_mem;
-		page = alloc_zeroed_user_highpage(vma, addr);
+			goto oom;
+		page = alloc_zeroed_user_highpage(vma, address);
 		if (!page)
-			goto no_mem;
+			goto oom;
 
 		spin_lock(&mm->page_table_lock);
-		page_table = pte_offset_map(pmd, addr);
+		page_table = pte_offset_map(pmd, address);
 
 		if (!pte_none(*page_table)) {
-			pte_unmap(page_table);
 			page_cache_release(page);
-			spin_unlock(&mm->page_table_lock);
-			goto out;
+			goto unlock;
 		}
 		inc_mm_counter(mm, rss);
-		entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
-							 vma->vm_page_prot)),
-				      vma);
+		entry = mk_pte(page, vma->vm_page_prot);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		lru_cache_add_active(page);
 		SetPageReferenced(page);
-		page_add_anon_rmap(page, vma, addr);
+		page_add_anon_rmap(page, vma, address);
 	}
 
-	set_pte_at(mm, addr, page_table, entry);
-	pte_unmap(page_table);
+	set_pte_at(mm, address, page_table, entry);
 
 	/* No need to invalidate - it was non-present before */
-	update_mmu_cache(vma, addr, entry);
+	update_mmu_cache(vma, address, entry);
 	lazy_mmu_prot_update(entry);
+unlock:
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
-out:
 	return VM_FAULT_MINOR;
-no_mem:
+oom:
 	return VM_FAULT_OOM;
 }
 
@@ -1829,20 +1807,17 @@ no_mem:
  * This is called with the MM semaphore held and the page table
  * spinlock held. Exit with the spinlock released.
  */
-static int
-do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
-	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
+static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access)
 {
-	struct page * new_page;
+	struct page *new_page;
 	struct address_space *mapping = NULL;
 	pte_t entry;
 	unsigned int sequence = 0;
 	int ret = VM_FAULT_MINOR;
 	int anon = 0;
 
-	if (!vma->vm_ops || !vma->vm_ops->nopage)
-		return do_anonymous_page(mm, vma, page_table,
-					pmd, write_access, address);
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 
@@ -1852,7 +1827,6 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		smp_rmb(); /* serializes i_size against truncate_count */
 	}
 retry:
-	cond_resched();
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
 	/*
 	 * No smp_rmb is needed here as long as there's a full
@@ -1892,9 +1866,11 @@ retry:
 	 * retry getting the page.
 	 */
 	if (mapping && unlikely(sequence != mapping->truncate_count)) {
-		sequence = mapping->truncate_count;
 		spin_unlock(&mm->page_table_lock);
 		page_cache_release(new_page);
+		cond_resched();
+		sequence = mapping->truncate_count;
+		smp_rmb();
 		goto retry;
 	}
 	page_table = pte_offset_map(pmd, address);
@@ -1924,25 +1900,22 @@ retry:
 			page_add_anon_rmap(new_page, vma, address);
 		} else
 			page_add_file_rmap(new_page);
-		pte_unmap(page_table);
 	} else {
 		/* One of our sibling threads was faster, back out. */
-		pte_unmap(page_table);
 		page_cache_release(new_page);
-		spin_unlock(&mm->page_table_lock);
-		goto out;
+		goto unlock;
 	}
 
 	/* no need to invalidate: a not-present page shouldn't be cached */
 	update_mmu_cache(vma, address, entry);
 	lazy_mmu_prot_update(entry);
+unlock:
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
-out:
 	return ret;
 oom:
 	page_cache_release(new_page);
-	ret = VM_FAULT_OOM;
-	goto out;
+	return VM_FAULT_OOM;
 }
 
 /*
@@ -1950,29 +1923,28 @@ oom:
  * from the encoded file_pte if possible. This enables swappable
  * nonlinear vmas.
  */
-static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
-	unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
+static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pte_t orig_pte)
 {
-	unsigned long pgoff;
+	pgoff_t pgoff;
 	int err;
 
-	BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
-	/*
-	 * Fall back to the linear mapping if the fs does not support
-	 * ->populate:
-	 */
-	if (!vma->vm_ops->populate ||
-			(write_access && !(vma->vm_flags & VM_SHARED))) {
-		pte_clear(mm, address, pte);
-		return do_no_page(mm, vma, address, write_access, pte, pmd);
-	}
-
-	pgoff = pte_to_pgoff(*pte);
-
-	pte_unmap(pte);
+	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 
-	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
+	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
+		/*
+		 * Page table corrupted: show pte and kill process.
+		 */
+		pte_ERROR(orig_pte);
+		return VM_FAULT_OOM;
+	}
+	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
+
+	pgoff = pte_to_pgoff(orig_pte);
+	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
+					vma->vm_page_prot, pgoff, 0);
 	if (err == -ENOMEM)
 		return VM_FAULT_OOM;
 	if (err)
@@ -2002,23 +1974,25 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
  * release it when done.
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
-	struct vm_area_struct * vma, unsigned long address,
-	int write_access, pte_t *pte, pmd_t *pmd)
+		struct vm_area_struct *vma, unsigned long address,
+		pte_t *pte, pmd_t *pmd, int write_access)
 {
 	pte_t entry;
 
 	entry = *pte;
 	if (!pte_present(entry)) {
-		/*
-		 * If it truly wasn't present, we know that kswapd
-		 * and the PTE updates will not touch it later. So
-		 * drop the lock.
-		 */
-		if (pte_none(entry))
-			return do_no_page(mm, vma, address, write_access, pte, pmd);
+		if (pte_none(entry)) {
+			if (!vma->vm_ops || !vma->vm_ops->nopage)
+				return do_anonymous_page(mm, vma, address,
+					pte, pmd, write_access);
+			return do_no_page(mm, vma, address,
+					pte, pmd, write_access);
+		}
 		if (pte_file(entry))
-			return do_file_page(mm, vma, address, write_access, pte, pmd);
-		return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
+			return do_file_page(mm, vma, address,
+					pte, pmd, write_access, entry);
+		return do_swap_page(mm, vma, address,
+					pte, pmd, write_access, entry);
 	}
 
 	if (write_access) {
@@ -2038,7 +2012,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, int write_access)
 {
 	pgd_t *pgd;
@@ -2072,7 +2046,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
 	if (!pte)
 		goto oom;
 	
-	return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
 
  oom:
 	spin_unlock(&mm->page_table_lock);
diff --git a/mm/shmem.c b/mm/shmem.c
index 55e04a0734c1..6796311a23ef 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1201,7 +1201,7 @@ static int shmem_populate(struct vm_area_struct *vma,
 				page_cache_release(page);
 				return err;
 			}
-		} else {
+		} else if (vma->vm_flags & VM_NONLINEAR) {
 			/* No page was found just because we can't read it in
 			 * now (being here implies nonblock != 0), but the page
 			 * may exist, so set the PTE to fault it in later. */

From 7be7a546994f1222b2312fd348da14e16b6b7b42 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:00 -0700
Subject: [PATCH 27/91] [PATCH] mm: move_page_tables by extents

Speeding up mremap's moving of ptes has never been a priority, but the locking
will get more complicated shortly, and is already too baroque.

Scrap the current one-by-one moving, do an extent at a time: curtailed by end
of src and dst pmds (have to use PMD_SIZE: the way pmd_addr_end gets elided
doesn't match this usage), and by latency considerations.

One nice property of the old method is lost: it never allocated a page table
unless absolutely necessary, so you could free empty page tables by mremapping
to and fro.  Whereas this way, it allocates a dst table wherever there was a
src table.  I keep diving in to reinstate the old behaviour, then come out
preferring not to clutter how it now is.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
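
As a rough model of the extent clamping described above: the standalone
user-space sketch below mirrors the per-iteration arithmetic of
move_page_tables() — clamp to the end of the source pmd, then the end of
the destination pmd, then LATENCY_LIMIT.  PMD_SIZE (a 2MB span), the
constants and the sample addresses are illustrative stand-ins, not values
taken from this patch.

/*
 * Model of the extent clamping in move_page_tables(): an extent may not
 * cross the end of the source pmd, the end of the destination pmd, or
 * LATENCY_LIMIT.  Constants are stand-ins for the kernel's values.
 */
#include <stdio.h>

#define PAGE_SIZE	0x1000UL
#define PMD_SIZE	0x200000UL		/* 2MB pmd span, for illustration */
#define PMD_MASK	(~(PMD_SIZE - 1))
#define LATENCY_LIMIT	(64 * PAGE_SIZE)

static unsigned long clamp_extent(unsigned long old_addr, unsigned long old_end,
				  unsigned long new_addr)
{
	unsigned long extent, next;

	next = (old_addr + PMD_SIZE) & PMD_MASK;	/* end of source pmd */
	if (next - 1 > old_end)
		next = old_end;
	extent = next - old_addr;

	next = (new_addr + PMD_SIZE) & PMD_MASK;	/* end of destination pmd */
	if (extent > next - new_addr)
		extent = next - new_addr;
	if (extent > LATENCY_LIMIT)
		extent = LATENCY_LIMIT;
	return extent;
}

int main(void)
{
	/* old range straddles a pmd boundary; new range is misaligned */
	unsigned long old_addr = 0x1ff000, old_end = 0x600000, new_addr = 0x7fe000;

	while (old_addr < old_end) {
		unsigned long extent = clamp_extent(old_addr, old_end, new_addr);

		printf("move %#lx..%#lx -> %#lx (%lu pages)\n",
		       old_addr, old_addr + extent, new_addr, extent / PAGE_SIZE);
		old_addr += extent;
		new_addr += extent;
	}
	return 0;
}
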
---
 mm/mremap.c | 166 ++++++++++++++++++++++------------------------------
 1 file changed, 71 insertions(+), 95 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 55df8f53e84d..f4e562098500 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,40 +22,15 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte = NULL;
-
-	pgd = pgd_offset(mm, addr);
-	if (pgd_none_or_clear_bad(pgd))
-		goto end;
-
-	pud = pud_offset(pgd, addr);
-	if (pud_none_or_clear_bad(pud))
-		goto end;
-
-	pmd = pmd_offset(pud, addr);
-	if (pmd_none_or_clear_bad(pmd))
-		goto end;
-
-	pte = pte_offset_map_nested(pmd, addr);
-	if (pte_none(*pte)) {
-		pte_unmap_nested(pte);
-		pte = NULL;
-	}
-end:
-	return pte;
-}
-
-static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 
+	/*
+	 * We don't need page_table_lock: we have mmap_sem exclusively.
+	 */
 	pgd = pgd_offset(mm, addr);
 	if (pgd_none_or_clear_bad(pgd))
 		return NULL;
@@ -68,35 +43,48 @@ static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
 	if (pmd_none_or_clear_bad(pmd))
 		return NULL;
 
-	return pte_offset_map(pmd, addr);
+	return pmd;
 }
 
-static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte = NULL;
+	pmd_t *pmd = NULL;
+	pte_t *pte;
 
+	/*
+	 * We do need page_table_lock: because allocators expect that.
+	 */
+	spin_lock(&mm->page_table_lock);
 	pgd = pgd_offset(mm, addr);
-
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
-		return NULL;
+		goto out;
+
 	pmd = pmd_alloc(mm, pud, addr);
-	if (pmd)
-		pte = pte_alloc_map(mm, pmd, addr);
-	return pte;
+	if (!pmd)
+		goto out;
+
+	pte = pte_alloc_map(mm, pmd, addr);
+	if (!pte) {
+		pmd = NULL;
+		goto out;
+	}
+	pte_unmap(pte);
+out:
+	spin_unlock(&mm->page_table_lock);
+	return pmd;
 }
 
-static int
-move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
-		struct vm_area_struct *new_vma, unsigned long new_addr)
+static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
+		unsigned long old_addr, unsigned long old_end,
+		struct vm_area_struct *new_vma, pmd_t *new_pmd,
+		unsigned long new_addr)
 {
 	struct address_space *mapping = NULL;
 	struct mm_struct *mm = vma->vm_mm;
-	int error = 0;
-	pte_t *src, *dst;
+	pte_t *old_pte, *new_pte, pte;
 
 	if (vma->vm_file) {
 		/*
@@ -111,74 +99,62 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
 		    new_vma->vm_truncate_count != vma->vm_truncate_count)
 			new_vma->vm_truncate_count = 0;
 	}
+
 	spin_lock(&mm->page_table_lock);
+	old_pte = pte_offset_map(old_pmd, old_addr);
+	new_pte = pte_offset_map_nested(new_pmd, new_addr);
 
-	src = get_one_pte_map_nested(mm, old_addr);
-	if (src) {
-		/*
-		 * Look to see whether alloc_one_pte_map needs to perform a
-		 * memory allocation.  If it does then we need to drop the
-		 * atomic kmap
-		 */
-		dst = get_one_pte_map(mm, new_addr);
-		if (unlikely(!dst)) {
-			pte_unmap_nested(src);
-			if (mapping)
-				spin_unlock(&mapping->i_mmap_lock);
-			dst = alloc_one_pte_map(mm, new_addr);
-			if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
-				spin_unlock(&mm->page_table_lock);
-				spin_lock(&mapping->i_mmap_lock);
-				spin_lock(&mm->page_table_lock);
-			}
-			src = get_one_pte_map_nested(mm, old_addr);
-		}
-		/*
-		 * Since alloc_one_pte_map can drop and re-acquire
-		 * page_table_lock, we should re-check the src entry...
-		 */
-		if (src) {
-			if (dst) {
-				pte_t pte;
-				pte = ptep_clear_flush(vma, old_addr, src);
-
-				/* ZERO_PAGE can be dependant on virtual addr */
-				pte = move_pte(pte, new_vma->vm_page_prot,
-							old_addr, new_addr);
-				set_pte_at(mm, new_addr, dst, pte);
-			} else
-				error = -ENOMEM;
-			pte_unmap_nested(src);
-		}
-		if (dst)
-			pte_unmap(dst);
+	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
+				   new_pte++, new_addr += PAGE_SIZE) {
+		if (pte_none(*old_pte))
+			continue;
+		pte = ptep_clear_flush(vma, old_addr, old_pte);
+		/* ZERO_PAGE can be dependant on virtual addr */
+		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
+		set_pte_at(mm, new_addr, new_pte, pte);
 	}
+
+	pte_unmap_nested(new_pte - 1);
+	pte_unmap(old_pte - 1);
 	spin_unlock(&mm->page_table_lock);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
-	return error;
 }
 
+#define LATENCY_LIMIT	(64 * PAGE_SIZE)
+
 static unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len)
 {
-	unsigned long offset;
+	unsigned long extent, next, old_end;
+	pmd_t *old_pmd, *new_pmd;
 
-	flush_cache_range(vma, old_addr, old_addr + len);
+	old_end = old_addr + len;
+	flush_cache_range(vma, old_addr, old_end);
 
-	/*
-	 * This is not the clever way to do this, but we're taking the
-	 * easy way out on the assumption that most remappings will be
-	 * only a few pages.. This also makes error recovery easier.
-	 */
-	for (offset = 0; offset < len; offset += PAGE_SIZE) {
-		if (move_one_page(vma, old_addr + offset,
-				new_vma, new_addr + offset) < 0)
-			break;
+	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
 		cond_resched();
+		next = (old_addr + PMD_SIZE) & PMD_MASK;
+		if (next - 1 > old_end)
+			next = old_end;
+		extent = next - old_addr;
+		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
+		if (!old_pmd)
+			continue;
+		new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+		if (!new_pmd)
+			break;
+		next = (new_addr + PMD_SIZE) & PMD_MASK;
+		if (extent > next - new_addr)
+			extent = next - new_addr;
+		if (extent > LATENCY_LIMIT)
+			extent = LATENCY_LIMIT;
+		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
+				new_vma, new_pmd, new_addr);
 	}
-	return offset;
+
+	return len + old_addr - old_end;	/* how much done */
 }
 
 static unsigned long move_vma(struct vm_area_struct *vma,

From 15a23ffa2fc91cebdac44d4aee994f59d5c28dc0 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:01 -0700
Subject: [PATCH 28/91] [PATCH] mm: tlb_gather_mmu get_cpu_var

tlb_gather_mmu dates from before kernel preemption was allowed, and uses
smp_processor_id or __get_cpu_var to find its per-cpu mmu_gather.  That works
because it's currently only called after getting page_table_lock, which is not
dropped until after the matching tlb_finish_mmu.  But don't rely on that, it
will soon change: now disable preemption internally by proper get_cpu_var in
tlb_gather_mmu, put_cpu_var in tlb_finish_mmu.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
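
For orientation, get_cpu_var/put_cpu_var differ from __get_cpu_var only
in bracketing the per-cpu access with preemption control.  Their 2.6-era
definitions were approximately as follows (paraphrased from memory of
include/linux/percpu.h, so treat the exact expansion as an approximation
rather than a quotation):

/* approximate 2.6-era expansions, for orientation only */
#define get_cpu_var(var)	(*({ preempt_disable(); &__get_cpu_var(var); }))
#define put_cpu_var(var)	preempt_enable()

So the per-cpu mmu_gather stays pinned to one CPU between tlb_gather_mmu
and tlb_finish_mmu even once page_table_lock is no longer held across
that window.
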
---
 include/asm-arm/tlb.h     |  5 +++--
 include/asm-arm26/tlb.h   |  7 ++++---
 include/asm-generic/tlb.h | 10 +++++-----
 include/asm-ia64/tlb.h    |  6 ++++--
 include/asm-sparc64/tlb.h |  4 +++-
 5 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/include/asm-arm/tlb.h b/include/asm-arm/tlb.h
index 9bb325c54645..da41df20928f 100644
--- a/include/asm-arm/tlb.h
+++ b/include/asm-arm/tlb.h
@@ -39,8 +39,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	int cpu = smp_processor_id();
-	struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu);
+	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
 	tlb->mm = mm;
 	tlb->freed = 0;
@@ -65,6 +64,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
+
+	put_cpu_var(mmu_gathers);
 }
 
 static inline unsigned int tlb_is_full_mm(struct mmu_gather *tlb)
diff --git a/include/asm-arm26/tlb.h b/include/asm-arm26/tlb.h
index 1316352a58f3..8486b00a6799 100644
--- a/include/asm-arm26/tlb.h
+++ b/include/asm-arm26/tlb.h
@@ -17,13 +17,12 @@ struct mmu_gather {
         unsigned int            avoided_flushes;
 };
 
-extern struct mmu_gather mmu_gathers[NR_CPUS];
+DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-        int cpu = smp_processor_id();
-        struct mmu_gather *tlb = &mmu_gathers[cpu];
+        struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
         tlb->mm = mm;
         tlb->freed = 0;
@@ -52,6 +51,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
         /* keep the page table cache within bounds */
         check_pgt_cache();
+
+        put_cpu_var(mmu_gathers);
 }
 
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 7d0298347ee7..c8232622c8d9 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -35,9 +35,7 @@
 #endif
 
 /* struct mmu_gather is an opaque type used by the mm code for passing around
- * any data needed by arch specific code for tlb_remove_page.  This structure
- * can be per-CPU or per-MM as the page table lock is held for the duration of
- * TLB shootdown.
+ * any data needed by arch specific code for tlb_remove_page.
  */
 struct mmu_gather {
 	struct mm_struct	*mm;
@@ -57,7 +55,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *tlb = &per_cpu(mmu_gathers, smp_processor_id());
+	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
 	tlb->mm = mm;
 
@@ -85,7 +83,7 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
 /* tlb_finish_mmu
  *	Called at the end of the shootdown operation to free up any resources
- *	that were required.  The page table lock is still held at this point.
+ *	that were required.
  */
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
@@ -101,6 +99,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
+
+	put_cpu_var(mmu_gathers);
 }
 
 static inline unsigned int
diff --git a/include/asm-ia64/tlb.h b/include/asm-ia64/tlb.h
index 3a9a6d1be75c..1b82299d7c1e 100644
--- a/include/asm-ia64/tlb.h
+++ b/include/asm-ia64/tlb.h
@@ -129,7 +129,7 @@ ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long e
 static inline struct mmu_gather *
 tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *tlb = &__get_cpu_var(mmu_gathers);
+	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
 	tlb->mm = mm;
 	/*
@@ -154,7 +154,7 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
 
 /*
  * Called at the end of the shootdown operation to free up any resources that were
- * collected.  The page table lock is still held at this point.
+ * collected.
  */
 static inline void
 tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
@@ -174,6 +174,8 @@ tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
+
+	put_cpu_var(mmu_gathers);
 }
 
 static inline unsigned int
diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h
index 9baf57db01d2..169309bdbf82 100644
--- a/include/asm-sparc64/tlb.h
+++ b/include/asm-sparc64/tlb.h
@@ -44,7 +44,7 @@ extern void flush_tlb_pending(void);
 
 static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *mp = &__get_cpu_var(mmu_gathers);
+	struct mmu_gather *mp = &get_cpu_var(mmu_gathers);
 
 	BUG_ON(mp->tlb_nr);
 
@@ -97,6 +97,8 @@ static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, un
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
+
+	put_cpu_var(mmu_gathers);
 }
 
 static inline unsigned int tlb_is_full_mm(struct mmu_gather *mp)

From 4d6ddfa9242bc3d27fb0f7248f6fdee0299c731f Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:02 -0700
Subject: [PATCH 29/91] [PATCH] mm: tlb_is_full_mm was obscure

tlb_is_full_mm?  What does that mean?  The TLB is full?  No, it means that the
mm's last user has gone and the whole mm is being torn down.  And it's an
inline function because sparc64 uses a different (slightly better)
"tlb_frozen" name for the flag others call "fullmm".

And now the ptep_get_and_clear_full macro used in zap_pte_range refers
directly to tlb->fullmm, which would be wrong for sparc64.  Rather than
correct that, I'd prefer to scrap tlb_is_full_mm altogether, and change
sparc64 to just use the same poor name as everyone else - is that okay?

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/sparc64/mm/tlb.c     |  4 ++--
 include/asm-arm/tlb.h     |  5 -----
 include/asm-arm26/tlb.h   |  7 -------
 include/asm-generic/tlb.h |  6 ------
 include/asm-ia64/tlb.h    |  6 ------
 include/asm-sparc64/tlb.h | 13 ++++---------
 mm/memory.c               |  4 ++--
 7 files changed, 8 insertions(+), 37 deletions(-)

diff --git a/arch/sparc64/mm/tlb.c b/arch/sparc64/mm/tlb.c
index 90ca99d0b89c..6a43f7cd090e 100644
--- a/arch/sparc64/mm/tlb.c
+++ b/arch/sparc64/mm/tlb.c
@@ -72,7 +72,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t
 
 no_cache_flush:
 
-	if (mp->tlb_frozen)
+	if (mp->fullmm)
 		return;
 
 	nr = mp->tlb_nr;
@@ -97,7 +97,7 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long
 	unsigned long nr = mp->tlb_nr;
 	long s = start, e = end, vpte_base;
 
-	if (mp->tlb_frozen)
+	if (mp->fullmm)
 		return;
 
 	/* If start is greater than end, that is a real problem.  */
diff --git a/include/asm-arm/tlb.h b/include/asm-arm/tlb.h
index da41df20928f..a35ab0f2e25e 100644
--- a/include/asm-arm/tlb.h
+++ b/include/asm-arm/tlb.h
@@ -68,11 +68,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 	put_cpu_var(mmu_gathers);
 }
 
-static inline unsigned int tlb_is_full_mm(struct mmu_gather *tlb)
-{
-	return tlb->fullmm;
-}
-
 #define tlb_remove_tlb_entry(tlb,ptep,address)	do { } while (0)
 
 /*
diff --git a/include/asm-arm26/tlb.h b/include/asm-arm26/tlb.h
index 8486b00a6799..c7d54ca0a239 100644
--- a/include/asm-arm26/tlb.h
+++ b/include/asm-arm26/tlb.h
@@ -55,13 +55,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
         put_cpu_var(mmu_gathers);
 }
 
-
-static inline unsigned int
-tlb_is_full_mm(struct mmu_gather *tlb)
-{
-     return tlb->fullmm;
-}
-
 #define tlb_remove_tlb_entry(tlb,ptep,address)  do { } while (0)
 //#define tlb_start_vma(tlb,vma)                  do { } while (0)
 //FIXME - ARM32 uses this now that things changed in the kernel. seems like it may be pointless on arm26, however to get things compiling...
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index c8232622c8d9..5d352a70f004 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -103,12 +103,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 	put_cpu_var(mmu_gathers);
 }
 
-static inline unsigned int
-tlb_is_full_mm(struct mmu_gather *tlb)
-{
-	return tlb->fullmm;
-}
-
 /* tlb_remove_page
  *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
  *	handling the additional races in SMP caused by other CPUs caching valid
diff --git a/include/asm-ia64/tlb.h b/include/asm-ia64/tlb.h
index 1b82299d7c1e..0bbd79f6a793 100644
--- a/include/asm-ia64/tlb.h
+++ b/include/asm-ia64/tlb.h
@@ -178,12 +178,6 @@ tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
 	put_cpu_var(mmu_gathers);
 }
 
-static inline unsigned int
-tlb_is_full_mm(struct mmu_gather *tlb)
-{
-     return tlb->fullmm;
-}
-
 /*
  * Logically, this routine frees PAGE.  On MP machines, the actual freeing of the page
  * must be delayed until after the TLB has been flushed (see comments at the beginning of
diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h
index 169309bdbf82..5d194eae870c 100644
--- a/include/asm-sparc64/tlb.h
+++ b/include/asm-sparc64/tlb.h
@@ -25,7 +25,7 @@ struct mmu_gather {
 	struct mm_struct *mm;
 	unsigned int pages_nr;
 	unsigned int need_flush;
-	unsigned int tlb_frozen;
+	unsigned int fullmm;
 	unsigned int tlb_nr;
 	unsigned long freed;
 	unsigned long vaddrs[TLB_BATCH_NR];
@@ -50,7 +50,7 @@ static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned i
 
 	mp->mm = mm;
 	mp->pages_nr = num_online_cpus() > 1 ? 0U : ~0U;
-	mp->tlb_frozen = full_mm_flush;
+	mp->fullmm = full_mm_flush;
 	mp->freed = 0;
 
 	return mp;
@@ -88,10 +88,10 @@ static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, un
 
 	tlb_flush_mmu(mp);
 
-	if (mp->tlb_frozen) {
+	if (mp->fullmm) {
 		if (CTX_VALID(mm->context))
 			do_flush_tlb_mm(mm);
-		mp->tlb_frozen = 0;
+		mp->fullmm = 0;
 	} else
 		flush_tlb_pending();
 
@@ -101,11 +101,6 @@ static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, un
 	put_cpu_var(mmu_gathers);
 }
 
-static inline unsigned int tlb_is_full_mm(struct mmu_gather *mp)
-{
-	return mp->tlb_frozen;
-}
-
 static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page)
 {
 	mp->need_flush = 1;
diff --git a/mm/memory.c b/mm/memory.c
index eaf79031f573..585bb4e0b97f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -249,7 +249,7 @@ void free_pgd_range(struct mmu_gather **tlb,
 		free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
 
-	if (!tlb_is_full_mm(*tlb))
+	if (!(*tlb)->fullmm)
 		flush_tlb_pgtables((*tlb)->mm, start, end);
 }
 
@@ -698,7 +698,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
 	int tlb_start_valid = 0;
 	unsigned long start = start_addr;
 	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
-	int fullmm = tlb_is_full_mm(*tlbp);
+	int fullmm = (*tlbp)->fullmm;
 
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
 		unsigned long end;

From fc2acab31be8e869b2d5f6de12f557f6f054f19c Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:03 -0700
Subject: [PATCH 30/91] [PATCH] mm: tlb_finish_mmu forget rss

zap_pte_range has been counting the pages it frees in tlb->freed, then
tlb_finish_mmu has used that to update the mm's rss.  That got stranger when I
added anon_rss, yet updated it by a different route; and stranger when rss and
anon_rss became mm_counters with special access macros.  And it would no
longer be viable if we're relying on page_table_lock to stabilize the
mm_counter, but calling tlb_finish_mmu outside that lock.

Remove the mmu_gather's freed field, let tlb_finish_mmu stick to its own
business, just decrement the rss mm_counter in zap_pte_range (yes, there was
some point to batching the update, and a subsequent patch restores that).  And
forget the anal paranoia of first reading the counter to avoid going negative
- if rss does go negative, just fix that bug.

Remove the mmu_gather's flushes and avoided_flushes from arm and arm26: no use
was being made of them.  But arm26 alone was actually using the freed, in the
way some others use need_flush: give it a need_flush.  arm26 seems to prefer
spaces to tabs here: respect that.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
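
The behavioural difference can be seen in a toy model: the old path
batched a freed count and clamped it against rss in tlb_finish_mmu,
while the new path just decrements as pages are zapped and lets a
negative count expose the bug.  The sketch below is standalone
user-space C; plain longs stand in for mm_counter_t and the page counts
are invented.

#include <stdio.h>

static long old_style(long rss, long freed)
{
	if (rss < freed)		/* the paranoia clamp being removed */
		freed = rss;
	return rss - freed;
}

static long new_style(long rss, long zapped)
{
	while (zapped--)
		rss--;			/* dec_mm_counter per page, no clamp */
	return rss;
}

int main(void)
{
	/* an over-count of 2 pages: the clamp hides it, the decrement shows it */
	printf("old: %ld  new: %ld\n", old_style(3, 5), new_style(3, 5));
	return 0;
}
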
---
 arch/sparc64/mm/tlb.c     |  3 +--
 include/asm-arm/tlb.h     | 15 +--------------
 include/asm-arm26/tlb.h   | 35 +++++++++++++----------------------
 include/asm-generic/tlb.h |  9 ---------
 include/asm-ia64/tlb.h    |  9 ---------
 include/asm-sparc64/tlb.h | 14 ++------------
 mm/memory.c               |  2 +-
 7 files changed, 18 insertions(+), 69 deletions(-)

diff --git a/arch/sparc64/mm/tlb.c b/arch/sparc64/mm/tlb.c
index 6a43f7cd090e..8b104be4662b 100644
--- a/arch/sparc64/mm/tlb.c
+++ b/arch/sparc64/mm/tlb.c
@@ -18,8 +18,7 @@
 
 /* Heavily inspired by the ppc64 code.  */
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) =
-	{ NULL, 0, 0, 0, 0, 0, { 0 }, { NULL }, };
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) = { 0, };
 
 void flush_tlb_pending(void)
 {
diff --git a/include/asm-arm/tlb.h b/include/asm-arm/tlb.h
index a35ab0f2e25e..f49bfb78c221 100644
--- a/include/asm-arm/tlb.h
+++ b/include/asm-arm/tlb.h
@@ -27,11 +27,7 @@
  */
 struct mmu_gather {
 	struct mm_struct	*mm;
-	unsigned int		freed;
 	unsigned int		fullmm;
-
-	unsigned int		flushes;
-	unsigned int		avoided_flushes;
 };
 
 DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -42,7 +38,6 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
 	tlb->mm = mm;
-	tlb->freed = 0;
 	tlb->fullmm = full_mm_flush;
 
 	return tlb;
@@ -51,16 +46,8 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-	struct mm_struct *mm = tlb->mm;
-	unsigned long freed = tlb->freed;
-	int rss = get_mm_counter(mm, rss);
-
-	if (rss < freed)
-		freed = rss;
-	add_mm_counter(mm, rss, -freed);
-
 	if (tlb->fullmm)
-		flush_tlb_mm(mm);
+		flush_tlb_mm(tlb->mm);
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
diff --git a/include/asm-arm26/tlb.h b/include/asm-arm26/tlb.h
index c7d54ca0a239..08ddd85b8d35 100644
--- a/include/asm-arm26/tlb.h
+++ b/include/asm-arm26/tlb.h
@@ -10,11 +10,8 @@
  */
 struct mmu_gather {
         struct mm_struct        *mm;
-        unsigned int            freed;
-	unsigned int            fullmm;
-
-        unsigned int            flushes;
-        unsigned int            avoided_flushes;
+        unsigned int            need_flush;
+        unsigned int            fullmm;
 };
 
 DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -25,8 +22,8 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
         struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
         tlb->mm = mm;
-        tlb->freed = 0;
-	tlb->fullmm = full_mm_flush;
+        tlb->need_flush = 0;
+        tlb->fullmm = full_mm_flush;
 
         return tlb;
 }
@@ -34,20 +31,8 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-        struct mm_struct *mm = tlb->mm;
-        unsigned long freed = tlb->freed;
-        int rss = get_mm_counter(mm, rss);
-
-        if (rss < freed)
-                freed = rss;
-        add_mm_counter(mm, rss, -freed);
-
-        if (freed) {
-                flush_tlb_mm(mm);
-                tlb->flushes++;
-        } else {
-                tlb->avoided_flushes++;
-        }
+        if (tlb->need_flush)
+                flush_tlb_mm(tlb->mm);
 
         /* keep the page table cache within bounds */
         check_pgt_cache();
@@ -65,7 +50,13 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
         } while (0)
 #define tlb_end_vma(tlb,vma)                    do { } while (0)
 
-#define tlb_remove_page(tlb,page)       free_page_and_swap_cache(page)
+static inline void
+tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+        tlb->need_flush = 1;
+        free_page_and_swap_cache(page);
+}
+
 #define pte_free_tlb(tlb,ptep)          pte_free(ptep)
 #define pmd_free_tlb(tlb,pmdp)          pmd_free(pmdp)
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 5d352a70f004..cdd4145243cd 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -42,7 +42,6 @@ struct mmu_gather {
 	unsigned int		nr;	/* set to ~0U means fast mode */
 	unsigned int		need_flush;/* Really unmapped some ptes? */
 	unsigned int		fullmm; /* non-zero means full mm flush */
-	unsigned long		freed;
 	struct page *		pages[FREE_PTE_NR];
 };
 
@@ -63,7 +62,6 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 	tlb->nr = num_online_cpus() > 1 ? 0U : ~0U;
 
 	tlb->fullmm = full_mm_flush;
-	tlb->freed = 0;
 
 	return tlb;
 }
@@ -88,13 +86,6 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-	int freed = tlb->freed;
-	struct mm_struct *mm = tlb->mm;
-	int rss = get_mm_counter(mm, rss);
-
-	if (rss < freed)
-		freed = rss;
-	add_mm_counter(mm, rss, -freed);
 	tlb_flush_mmu(tlb, start, end);
 
 	/* keep the page table cache within bounds */
diff --git a/include/asm-ia64/tlb.h b/include/asm-ia64/tlb.h
index 0bbd79f6a793..834370b9dea1 100644
--- a/include/asm-ia64/tlb.h
+++ b/include/asm-ia64/tlb.h
@@ -60,7 +60,6 @@ struct mmu_gather {
 	unsigned int		nr;		/* == ~0U => fast mode */
 	unsigned char		fullmm;		/* non-zero means full mm flush */
 	unsigned char		need_flush;	/* really unmapped some PTEs? */
-	unsigned long		freed;		/* number of pages freed */
 	unsigned long		start_addr;
 	unsigned long		end_addr;
 	struct page 		*pages[FREE_PTE_NR];
@@ -147,7 +146,6 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
 	 */
 	tlb->nr = (num_online_cpus() == 1) ? ~0U : 0;
 	tlb->fullmm = full_mm_flush;
-	tlb->freed = 0;
 	tlb->start_addr = ~0UL;
 	return tlb;
 }
@@ -159,13 +157,6 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
 static inline void
 tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-	unsigned long freed = tlb->freed;
-	struct mm_struct *mm = tlb->mm;
-	unsigned long rss = get_mm_counter(mm, rss);
-
-	if (rss < freed)
-		freed = rss;
-	add_mm_counter(mm, rss, -freed);
 	/*
 	 * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
 	 * tlb->end_addr.
diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h
index 5d194eae870c..66138d959df5 100644
--- a/include/asm-sparc64/tlb.h
+++ b/include/asm-sparc64/tlb.h
@@ -27,7 +27,6 @@ struct mmu_gather {
 	unsigned int need_flush;
 	unsigned int fullmm;
 	unsigned int tlb_nr;
-	unsigned long freed;
 	unsigned long vaddrs[TLB_BATCH_NR];
 	struct page *pages[FREE_PTE_NR];
 };
@@ -51,7 +50,6 @@ static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned i
 	mp->mm = mm;
 	mp->pages_nr = num_online_cpus() > 1 ? 0U : ~0U;
 	mp->fullmm = full_mm_flush;
-	mp->freed = 0;
 
 	return mp;
 }
@@ -78,19 +76,11 @@ extern void smp_flush_tlb_mm(struct mm_struct *mm);
 
 static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, unsigned long end)
 {
-	unsigned long freed = mp->freed;
-	struct mm_struct *mm = mp->mm;
-	unsigned long rss = get_mm_counter(mm, rss);
-
-	if (rss < freed)
-		freed = rss;
-	add_mm_counter(mm, rss, -freed);
-
 	tlb_flush_mmu(mp);
 
 	if (mp->fullmm) {
-		if (CTX_VALID(mm->context))
-			do_flush_tlb_mm(mm);
+		if (CTX_VALID(mp->mm->context))
+			do_flush_tlb_mm(mp->mm);
 		mp->fullmm = 0;
 	} else
 		flush_tlb_pending();
diff --git a/mm/memory.c b/mm/memory.c
index 585bb4e0b97f..51eb38574830 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -582,7 +582,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 				if (pte_young(ptent))
 					mark_page_accessed(page);
 			}
-			tlb->freed++;
+			dec_mm_counter(tlb->mm, rss);
 			page_remove_rmap(page);
 			tlb_remove_page(tlb, page);
 			continue;

From 404351e67a9facb475abf1492245374a28d13e90 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:04 -0700
Subject: [PATCH 31/91] [PATCH] mm: mm_init set_mm_counters

How is anon_rss initialized?  In dup_mmap, and by mm_alloc's memset; but
that's not so good if an mm_counter_t is a special type.  And how is rss
initialized?  By set_mm_counter, all over the place.  Come on, we just need to
initialize them both at once by set_mm_counter in mm_init (which follows the
memcpy when forking).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/mips/kernel/irixelf.c          | 1 -
 arch/sparc64/kernel/binfmt_aout32.c | 1 -
 arch/x86_64/ia32/ia32_aout.c        | 1 -
 fs/binfmt_aout.c                    | 1 -
 fs/binfmt_elf.c                     | 1 -
 fs/binfmt_elf_fdpic.c               | 7 -------
 fs/binfmt_flat.c                    | 1 -
 fs/binfmt_som.c                     | 1 -
 kernel/fork.c                       | 4 ++--
 9 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c
index 99262fe64560..7ce34d4aa220 100644
--- a/arch/mips/kernel/irixelf.c
+++ b/arch/mips/kernel/irixelf.c
@@ -697,7 +697,6 @@ static int load_irix_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	/* Do this so that we can load the interpreter, if need be.  We will
 	 * change some of these later.
 	 */
-	set_mm_counter(current->mm, rss, 0);
 	setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
 	current->mm->start_stack = bprm->p;
 
diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c
index b2854ef221d0..edf52d06b280 100644
--- a/arch/sparc64/kernel/binfmt_aout32.c
+++ b/arch/sparc64/kernel/binfmt_aout32.c
@@ -241,7 +241,6 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 3e6780fa0186..93c60f4aa47a 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -314,7 +314,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
 	current->mm->cached_hole_size = 0;
 
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index dd9baabaf016..72011826f0cb 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -318,7 +318,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d4b15576e584..918ccc267e41 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -773,7 +773,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 134c9c0d1f54..dda87c4c82a3 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -294,14 +294,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
 				  &interp_params,
 				  &current->mm->start_stack,
 				  &current->mm->start_brk);
-#endif
 
-	/* do this so that we can load the interpreter, if need be
-	 * - we will change some of these later
-	 */
-	set_mm_counter(current->mm, rss, 0);
-
-#ifdef CONFIG_MMU
 	retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack);
 	if (retval < 0) {
 		send_sig(SIGKILL, current, 0);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 7974efa107bc..9d6625829b99 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -650,7 +650,6 @@ static int load_flat_file(struct linux_binprm * bprm,
 		current->mm->start_brk = datapos + data_len + bss_len;
 		current->mm->brk = (current->mm->start_brk + 3) & ~3;
 		current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
-		set_mm_counter(current->mm, rss, 0);
 	}
 
 	if (flags & FLAT_FLAG_KTRACE)
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 227a2682d2bf..00a91dc25d16 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -259,7 +259,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	create_som_tables(bprm);
 
 	current->mm->start_stack = bprm->p;
-	set_mm_counter(current->mm, rss, 0);
 
 #if 0
 	printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
diff --git a/kernel/fork.c b/kernel/fork.c
index e2ff11f8c1b0..25caa02e2eac 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -198,8 +198,6 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
-	set_mm_counter(mm, rss, 0);
-	set_mm_counter(mm, anon_rss, 0);
 	cpus_clear(mm->cpu_vm_mask);
 	mm->mm_rb = RB_ROOT;
 	rb_link = &mm->mm_rb.rb_node;
@@ -323,6 +321,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_waiters = 0;
 	mm->nr_ptes = 0;
+	set_mm_counter(mm, rss, 0);
+	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
 	rwlock_init(&mm->ioctx_list_lock);
 	mm->ioctx_list = NULL;

From 4294621f41a85497019fae64341aa5351a1921b7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:05 -0700
Subject: [PATCH 32/91] [PATCH] mm: rss = file_rss + anon_rss

I was lazy when we added anon_rss, and chose to change as few places as
possible.  So currently each anonymous page has to be counted twice, in rss
and in anon_rss.  Which won't be so good if those are atomic counts in some
configurations.

Change that around: keep file_rss and anon_rss separately, and add them
together (with get_mm_rss macro) when the total is needed - reading two
atomics is much cheaper than updating two atomics.  And update anon_rss
upfront, typically in memory.c, not tucked away in page_add_anon_rmap.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
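
A toy user-space model of the split counters: file_rss and anon_rss are
kept separately and only summed when a total is wanted, so a COW fault
that turns a file page into an anonymous one moves a count between them
without touching any stored total.  Plain longs stand in for
mm_counter_t; the three events are invented examples of the call sites
this patch touches.

#include <stdio.h>

struct mm_model {
	long file_rss;
	long anon_rss;
};

/* counterpart of the new get_mm_rss(): sum on read, never stored */
static long get_mm_rss(const struct mm_model *mm)
{
	return mm->file_rss + mm->anon_rss;
}

int main(void)
{
	struct mm_model mm = { 0, 0 };

	mm.file_rss += 3;	/* three file-backed pages faulted in (do_no_page) */
	mm.anon_rss += 2;	/* two anonymous pages (do_anonymous_page) */

	/* COW of a file page: do_wp_page moves one page from file to anon */
	mm.file_rss--;
	mm.anon_rss++;

	printf("file=%ld anon=%ld rss=%ld\n",
	       mm.file_rss, mm.anon_rss, get_mm_rss(&mm));	/* file=2 anon=3 rss=5 */
	return 0;
}
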
---
 fs/exec.c             |  2 +-
 fs/proc/array.c       |  2 +-
 fs/proc/task_mmu.c    |  8 +++-----
 include/linux/sched.h |  4 +++-
 kernel/acct.c         |  2 +-
 kernel/fork.c         |  4 ++--
 mm/fremap.c           |  4 ++--
 mm/hugetlb.c          |  6 +++---
 mm/memory.c           | 31 +++++++++++++++++--------------
 mm/nommu.c            |  2 +-
 mm/rmap.c             |  8 +++-----
 mm/swapfile.c         |  2 +-
 12 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index d2208f7c87db..cefadf5ab83b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -330,7 +330,7 @@ void install_arg_page(struct vm_area_struct *vma,
 		pte_unmap(pte);
 		goto out;
 	}
-	inc_mm_counter(mm, rss);
+	inc_mm_counter(mm, anon_rss);
 	lru_cache_add_active(page);
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d84eecacbeaf..3e1239e4b303 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -438,7 +438,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 		jiffies_to_clock_t(it_real_value),
 		start_time,
 		vsize,
-		mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */
+		mm ? get_mm_rss(mm) : 0,
 	        rsslim,
 		mm ? mm->start_code : 0,
 		mm ? mm->end_code : 0,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 994612bc72d0..bccee7cf9ccd 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -29,7 +29,7 @@ char *task_mem(struct mm_struct *mm, char *buffer)
 		"VmPTE:\t%8lu kB\n",
 		(mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
 		mm->locked_vm << (PAGE_SHIFT-10),
-		get_mm_counter(mm, rss) << (PAGE_SHIFT-10),
+		get_mm_rss(mm) << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
 		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
@@ -44,13 +44,11 @@ unsigned long task_vsize(struct mm_struct *mm)
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	int rss = get_mm_counter(mm, rss);
-
-	*shared = rss - get_mm_counter(mm, anon_rss);
+	*shared = get_mm_counter(mm, file_rss);
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->total_vm - mm->shared_vm;
-	*resident = rss;
+	*resident = *shared + get_mm_counter(mm, anon_rss);
 	return mm->total_vm;
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 27519df0f987..afcaac66cbd5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -254,6 +254,8 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
 #define inc_mm_counter(mm, member) (mm)->_##member++
 #define dec_mm_counter(mm, member) (mm)->_##member--
+#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
+
 typedef unsigned long mm_counter_t;
 
 struct mm_struct {
@@ -286,7 +288,7 @@ struct mm_struct {
 	unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;
 
 	/* Special counters protected by the page_table_lock */
-	mm_counter_t _rss;
+	mm_counter_t _file_rss;
 	mm_counter_t _anon_rss;
 
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
diff --git a/kernel/acct.c b/kernel/acct.c
index b756f527497e..2e3f4a47e7d0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -553,7 +553,7 @@ void acct_update_integrals(struct task_struct *tsk)
 		if (delta == 0)
 			return;
 		tsk->acct_stimexpd = tsk->stime;
-		tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss);
+		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
 	}
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 25caa02e2eac..2048ed7b5872 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -321,7 +321,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_waiters = 0;
 	mm->nr_ptes = 0;
-	set_mm_counter(mm, rss, 0);
+	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
 	rwlock_init(&mm->ioctx_list_lock);
@@ -499,7 +499,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 	if (retval)
 		goto free_pt;
 
-	mm->hiwater_rss = get_mm_counter(mm,rss);
+	mm->hiwater_rss = get_mm_rss(mm);
 	mm->hiwater_vm = mm->total_vm;
 
 good_mm:
diff --git a/mm/fremap.c b/mm/fremap.c
index ab23a0673c35..fd7f2a17ff3e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -39,7 +39,7 @@ static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 					set_page_dirty(page);
 				page_remove_rmap(page);
 				page_cache_release(page);
-				dec_mm_counter(mm, rss);
+				dec_mm_counter(mm, file_rss);
 			}
 		}
 	} else {
@@ -95,7 +95,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	zap_pte(mm, vma, addr, pte);
 
-	inc_mm_counter(mm,rss);
+	inc_mm_counter(mm, file_rss);
 	flush_icache_page(vma, page);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 	page_add_file_rmap(page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 61d380678030..094455bcbbf7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -286,7 +286,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
-			add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
 		spin_unlock(&src->page_table_lock);
@@ -324,7 +324,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 
 		page = pte_page(pte);
 		put_page(page);
-		add_mm_counter(mm, rss,  - (HPAGE_SIZE / PAGE_SIZE));
+		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
 	}
 	flush_tlb_range(vma, start, end);
 }
@@ -386,7 +386,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				goto out;
 			}
 		}
-		add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+		add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
 		set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
 	}
 out:
diff --git a/mm/memory.c b/mm/memory.c
index 51eb38574830..59d42e50fa53 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -397,9 +397,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte = pte_mkclean(pte);
 	pte = pte_mkold(pte);
 	get_page(page);
-	inc_mm_counter(dst_mm, rss);
 	if (PageAnon(page))
 		inc_mm_counter(dst_mm, anon_rss);
+	else
+		inc_mm_counter(dst_mm, file_rss);
 	set_pte_at(dst_mm, addr, dst_pte, pte);
 	page_dup_rmap(page);
 }
@@ -581,8 +582,8 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 					set_page_dirty(page);
 				if (pte_young(ptent))
 					mark_page_accessed(page);
+				dec_mm_counter(tlb->mm, file_rss);
 			}
-			dec_mm_counter(tlb->mm, rss);
 			page_remove_rmap(page);
 			tlb_remove_page(tlb, page);
 			continue;
@@ -1290,13 +1291,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 	if (likely(pte_same(*page_table, orig_pte))) {
-		if (PageAnon(old_page))
-			dec_mm_counter(mm, anon_rss);
 		if (PageReserved(old_page))
-			inc_mm_counter(mm, rss);
-		else
+			inc_mm_counter(mm, anon_rss);
+		else {
 			page_remove_rmap(old_page);
-
+			if (!PageAnon(old_page)) {
+				inc_mm_counter(mm, anon_rss);
+				dec_mm_counter(mm, file_rss);
+			}
+		}
 		flush_cache_page(vma, address, pfn);
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -1701,7 +1704,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	/* The page isn't present yet, go ahead with the fault. */
 
-	inc_mm_counter(mm, rss);
+	inc_mm_counter(mm, anon_rss);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if (write_access && can_share_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -1774,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			page_cache_release(page);
 			goto unlock;
 		}
-		inc_mm_counter(mm, rss);
+		inc_mm_counter(mm, anon_rss);
 		entry = mk_pte(page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		lru_cache_add_active(page);
@@ -1887,19 +1890,19 @@ retry:
 	 */
 	/* Only go through if we didn't race with anybody else... */
 	if (pte_none(*page_table)) {
-		if (!PageReserved(new_page))
-			inc_mm_counter(mm, rss);
-
 		flush_icache_page(vma, new_page);
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		if (write_access)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
+			inc_mm_counter(mm, anon_rss);
 			lru_cache_add_active(new_page);
 			page_add_anon_rmap(new_page, vma, address);
-		} else
+		} else if (!PageReserved(new_page)) {
+			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
 		page_cache_release(new_page);
@@ -2192,7 +2195,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
 void update_mem_hiwater(struct task_struct *tsk)
 {
 	if (tsk->mm) {
-		unsigned long rss = get_mm_counter(tsk->mm, rss);
+		unsigned long rss = get_mm_rss(tsk->mm);
 
 		if (tsk->mm->hiwater_rss < rss)
 			tsk->mm->hiwater_rss = rss;
diff --git a/mm/nommu.c b/mm/nommu.c
index 0ef241ae3763..599924886eb5 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1083,7 +1083,7 @@ void update_mem_hiwater(struct task_struct *tsk)
 	unsigned long rss;
 
 	if (likely(tsk->mm)) {
-		rss = get_mm_counter(tsk->mm, rss);
+		rss = get_mm_rss(tsk->mm);
 		if (tsk->mm->hiwater_rss < rss)
 			tsk->mm->hiwater_rss = rss;
 		if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
diff --git a/mm/rmap.c b/mm/rmap.c
index 1fc559e09ca8..504757624cce 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -445,8 +445,6 @@ void page_add_anon_rmap(struct page *page,
 {
 	BUG_ON(PageReserved(page));
 
-	inc_mm_counter(vma->vm_mm, anon_rss);
-
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		struct anon_vma *anon_vma = vma->anon_vma;
 
@@ -561,9 +559,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
 		dec_mm_counter(mm, anon_rss);
-	}
+	} else
+		dec_mm_counter(mm, file_rss);
 
-	dec_mm_counter(mm, rss);
 	page_remove_rmap(page);
 	page_cache_release(page);
 
@@ -667,7 +665,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
 
 		page_remove_rmap(page);
 		page_cache_release(page);
-		dec_mm_counter(mm, rss);
+		dec_mm_counter(mm, file_rss);
 		(*mapcount)--;
 	}
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 05c851291241..296e0bbf7836 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -407,7 +407,7 @@ void free_swap_and_cache(swp_entry_t entry)
 static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 		unsigned long addr, swp_entry_t entry, struct page *page)
 {
-	inc_mm_counter(vma->vm_mm, rss);
+	inc_mm_counter(vma->vm_mm, anon_rss);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));

From ae859762332f19bfc06f4c4a1b1fefb41e9e1084 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:05 -0700
Subject: [PATCH 33/91] [PATCH] mm: batch updating mm_counters

tlb_finish_mmu used to batch zap_pte_range's update of mm rss, which may be
worthwhile if the mm is contended, and would reduce atomic operations if the
counts were atomic.  Let zap_pte_range now batch its updates to file_rss and
anon_rss, per page-table in case we drop the lock outside; and copy_pte_range
batch them too.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
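
The rss[] indexing trick can be modelled in isolation: the per-pte
helper returns 0 for a file page, 1 for an anonymous page, or NO_RSS
when neither counter should move; the walker just does rss[kind]++, and
only rss[0]/rss[1] are applied once per batch.  The sketch below is
standalone user-space C with an invented sequence of page kinds.

#include <stdio.h>

#define NO_RSS 2	/* increment neither file_rss nor anon_rss */

static long file_rss, anon_rss;

static void add_mm_rss(int file, int anon)
{
	if (file)
		file_rss += file;
	if (anon)
		anon_rss += anon;
}

int main(void)
{
	/* 0 = file page, 1 = anonymous page, NO_RSS = swap entry/reserved page */
	int kinds[] = { 0, 1, 1, NO_RSS, 0, 1, NO_RSS };
	int rss[NO_RSS + 1] = { 0, 0, 0 };	/* rss[NO_RSS] is a dumping ground */
	unsigned int i;

	for (i = 0; i < sizeof(kinds) / sizeof(kinds[0]); i++)
		rss[kinds[i]]++;		/* no branch per pte */

	add_mm_rss(rss[0], rss[1]);		/* one update per batch */
	printf("file_rss=%ld anon_rss=%ld (ignored=%d)\n",
	       file_rss, anon_rss, rss[NO_RSS]);
	return 0;
}
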
---
 mm/memory.c | 47 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 15 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 59d42e50fa53..da642b5528fa 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -332,6 +332,16 @@ out:
 	return pte_offset_kernel(pmd, address);
 }
 
+static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+{
+	if (file_rss)
+		add_mm_counter(mm, file_rss, file_rss);
+	if (anon_rss)
+		add_mm_counter(mm, anon_rss, anon_rss);
+}
+
+#define NO_RSS 2	/* Increment neither file_rss nor anon_rss */
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
@@ -341,7 +351,7 @@ out:
  * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
  */
 
-static inline void
+static inline int
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
 		unsigned long addr)
@@ -349,6 +359,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pte_t pte = *src_pte;
 	struct page *page;
 	unsigned long pfn;
+	int anon = NO_RSS;
 
 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
@@ -361,8 +372,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 				spin_unlock(&mmlist_lock);
 			}
 		}
-		set_pte_at(dst_mm, addr, dst_pte, pte);
-		return;
+		goto out_set_pte;
 	}
 
 	pfn = pte_pfn(pte);
@@ -375,10 +385,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (pfn_valid(pfn))
 		page = pfn_to_page(pfn);
 
-	if (!page || PageReserved(page)) {
-		set_pte_at(dst_mm, addr, dst_pte, pte);
-		return;
-	}
+	if (!page || PageReserved(page))
+		goto out_set_pte;
 
 	/*
 	 * If it's a COW mapping, write protect it both
@@ -397,12 +405,12 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte = pte_mkclean(pte);
 	pte = pte_mkold(pte);
 	get_page(page);
-	if (PageAnon(page))
-		inc_mm_counter(dst_mm, anon_rss);
-	else
-		inc_mm_counter(dst_mm, file_rss);
-	set_pte_at(dst_mm, addr, dst_pte, pte);
 	page_dup_rmap(page);
+	anon = !!PageAnon(page);
+
+out_set_pte:
+	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return anon;
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -412,8 +420,10 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pte_t *src_pte, *dst_pte;
 	unsigned long vm_flags = vma->vm_flags;
 	int progress = 0;
+	int rss[NO_RSS+1], anon;
 
 again:
+	rss[1] = rss[0] = 0;
 	dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
 	if (!dst_pte)
 		return -ENOMEM;
@@ -436,13 +446,16 @@ again:
 			progress++;
 			continue;
 		}
-		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr);
+		anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
+							vm_flags, addr);
+		rss[anon]++;
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 	spin_unlock(&src_mm->page_table_lock);
 
 	pte_unmap_nested(src_pte - 1);
 	pte_unmap(dst_pte - 1);
+	add_mm_rss(dst_mm, rss[0], rss[1]);
 	cond_resched_lock(&dst_mm->page_table_lock);
 	if (addr != end)
 		goto again;
@@ -533,6 +546,8 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 				struct zap_details *details)
 {
 	pte_t *pte;
+	int file_rss = 0;
+	int anon_rss = 0;
 
 	pte = pte_offset_map(pmd, addr);
 	do {
@@ -576,13 +591,13 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 				set_pte_at(tlb->mm, addr, pte,
 					   pgoff_to_pte(page->index));
 			if (PageAnon(page))
-				dec_mm_counter(tlb->mm, anon_rss);
+				anon_rss++;
 			else {
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
 				if (pte_young(ptent))
 					mark_page_accessed(page);
-				dec_mm_counter(tlb->mm, file_rss);
+				file_rss++;
 			}
 			page_remove_rmap(page);
 			tlb_remove_page(tlb, page);
@@ -598,6 +613,8 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 			free_swap_and_cache(pte_to_swp_entry(ptent));
 		pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	add_mm_rss(tlb->mm, -file_rss, -anon_rss);
 	pte_unmap(pte - 1);
 }
 

From fd3e42fcc888a773572282575d2fdbf5cfd6216e Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:06 -0700
Subject: [PATCH 34/91] [PATCH] mm: dup_mmap use oldmm more

Use the parent's oldmm throughout dup_mmap, instead of perversely going back
to current->mm.  (Can you hear the sigh of relief from those mpnts?  Usually I
squash them, but not today.)

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 2048ed7b5872..0e7fe4a8a8df 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -182,16 +182,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 }
 
 #ifdef CONFIG_MMU
-static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
+static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 {
-	struct vm_area_struct * mpnt, *tmp, **pprev;
+	struct vm_area_struct *mpnt, *tmp, **pprev;
 	struct rb_node **rb_link, *rb_parent;
 	int retval;
 	unsigned long charge;
 	struct mempolicy *pol;
 
 	down_write(&oldmm->mmap_sem);
-	flush_cache_mm(current->mm);
+	flush_cache_mm(oldmm);
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
@@ -204,7 +204,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 	rb_parent = NULL;
 	pprev = &mm->mmap;
 
-	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
+	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
 		struct file *file;
 
 		if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -265,7 +265,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 		rb_parent = &tmp->vm_rb;
 
 		mm->map_count++;
-		retval = copy_page_range(mm, current->mm, tmp);
+		retval = copy_page_range(mm, oldmm, tmp);
 		spin_unlock(&mm->page_table_lock);
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
@@ -277,7 +277,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 	retval = 0;
 
 out:
-	flush_tlb_mm(current->mm);
+	flush_tlb_mm(oldmm);
 	up_write(&oldmm->mmap_sem);
 	return retval;
 fail_nomem_policy:

From 7ee78232501ea9de2b6c8f10d32c9a0fee541357 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:08 -0700
Subject: [PATCH 35/91] [PATCH] mm: dup_mmap down new mmap_sem

One anomaly remains from when Andrea rationalized the responsibilities of
mmap_sem and page_table_lock: in dup_mmap we add vmas to the child holding its
page_table_lock, but not the mmap_sem which normally guards the vma list and
rbtree.  Which could be an issue for unuse_mm: though since it just walks down
the list (today with page_table_lock, tomorrow not), it's probably okay.  Will
need a memory barrier?  Oh, keep it simple, Nick and I agreed, no harm in
taking child's mmap_sem here.
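
A minimal sketch of the resulting lock nesting (illustrative only;
dup_mmap_locking_sketch() is a made-up name): the parent's mmap_sem is taken
first and released last, and since the child mm is not yet visible to any
other task, nesting its mmap_sem inside cannot deadlock.

	static void dup_mmap_locking_sketch(struct mm_struct *mm,
					    struct mm_struct *oldmm)
	{
		down_write(&oldmm->mmap_sem);	/* parent: the vma list we read */
		down_write(&mm->mmap_sem);	/* child: the vma list we build */

		/* ... copy vmas and page tables, as dup_mmap does ... */

		up_write(&mm->mmap_sem);
		flush_tlb_mm(oldmm);
		up_write(&oldmm->mmap_sem);
	}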

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 0e7fe4a8a8df..2a587b3224e3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -192,6 +192,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 
 	down_write(&oldmm->mmap_sem);
 	flush_cache_mm(oldmm);
+	down_write(&mm->mmap_sem);
+
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
@@ -251,10 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		}
 
 		/*
-		 * Link in the new vma and copy the page table entries:
-		 * link in first so that swapoff can see swap entries.
-		 * Note that, exceptionally, here the vma is inserted
-		 * without holding mm->mmap_sem.
+		 * Link in the new vma and copy the page table entries.
 		 */
 		spin_lock(&mm->page_table_lock);
 		*pprev = tmp;
@@ -275,8 +274,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			goto out;
 	}
 	retval = 0;
-
 out:
+	up_write(&mm->mmap_sem);
 	flush_tlb_mm(oldmm);
 	up_write(&oldmm->mmap_sem);
 	return retval;

From 147efea8ebb034b48aee806caae1da9a2ee41b38 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:09 -0700
Subject: [PATCH 36/91] [PATCH] mm: sh64 hugetlbpage.c

The sh64 hugetlbpage.c seems to be erroneous, left over from a bygone age,
clashing with the common hugetlb.c.  Replace it by a copy of the sh
hugetlbpage.c.  Except, delete the mk_pte_huge macro, which neither of them uses.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/sh/mm/hugetlbpage.c   |   2 -
 arch/sh64/mm/hugetlbpage.c | 188 +++----------------------------------
 2 files changed, 12 insertions(+), 178 deletions(-)

diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 95bb1a6c6060..6b7a7688c98e 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -54,8 +54,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
-#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0)
-
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t entry)
 {
diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c
index dcd9c8a8baf8..ed6a505b3ee2 100644
--- a/arch/sh64/mm/hugetlbpage.c
+++ b/arch/sh64/mm/hugetlbpage.c
@@ -54,41 +54,31 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
-#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0)
-
-static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-			 struct page *page, pte_t * page_table, int write_access)
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t entry)
 {
-	unsigned long i;
-	pte_t entry;
-
-	add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
-
-	if (write_access)
-		entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
-						       vma->vm_page_prot)));
-	else
-		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
-	entry = pte_mkyoung(entry);
-	mk_pte_huge(entry);
+	int i;
 
 	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-		set_pte(page_table, entry);
-		page_table++;
-
+		set_pte_at(mm, addr, ptep, entry);
+		ptep++;
+		addr += PAGE_SIZE;
 		pte_val(entry) += PAGE_SIZE;
 	}
 }
 
-pte_t huge_ptep_get_and_clear(pte_t *ptep)
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep)
 {
 	pte_t entry;
+	int i;
 
 	entry = *ptep;
 
 	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-		pte_clear(pte);
-		pte++;
+		pte_clear(mm, addr, ptep);
+		addr += PAGE_SIZE;
+		ptep++;
 	}
 
 	return entry;
@@ -106,79 +96,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 
-int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
-			    struct vm_area_struct *vma)
-{
-	pte_t *src_pte, *dst_pte, entry;
-	struct page *ptepage;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	int i;
-
-	while (addr < end) {
-		dst_pte = huge_pte_alloc(dst, addr);
-		if (!dst_pte)
-			goto nomem;
-		src_pte = huge_pte_offset(src, addr);
-		BUG_ON(!src_pte || pte_none(*src_pte));
-		entry = *src_pte;
-		ptepage = pte_page(entry);
-		get_page(ptepage);
-		for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-			set_pte(dst_pte, entry);
-			pte_val(entry) += PAGE_SIZE;
-			dst_pte++;
-		}
-		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
-		addr += HPAGE_SIZE;
-	}
-	return 0;
-
-nomem:
-	return -ENOMEM;
-}
-
-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-			struct page **pages, struct vm_area_struct **vmas,
-			unsigned long *position, int *length, int i)
-{
-	unsigned long vaddr = *position;
-	int remainder = *length;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-
-	while (vaddr < vma->vm_end && remainder) {
-		if (pages) {
-			pte_t *pte;
-			struct page *page;
-
-			pte = huge_pte_offset(mm, vaddr);
-
-			/* hugetlb should be locked, and hence, prefaulted */
-			BUG_ON(!pte || pte_none(*pte));
-
-			page = pte_page(*pte);
-
-			WARN_ON(!PageCompound(page));
-
-			get_page(page);
-			pages[i] = page;
-		}
-
-		if (vmas)
-			vmas[i] = vma;
-
-		vaddr += PAGE_SIZE;
-		--remainder;
-		++i;
-	}
-
-	*length = remainder;
-	*position = vaddr;
-
-	return i;
-}
-
 struct page *follow_huge_addr(struct mm_struct *mm,
 			      unsigned long address, int write)
 {
@@ -195,84 +112,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 {
 	return NULL;
 }
-
-void unmap_hugepage_range(struct vm_area_struct *vma,
-			  unsigned long start, unsigned long end)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long address;
-	pte_t *pte;
-	struct page *page;
-	int i;
-
-	BUG_ON(start & (HPAGE_SIZE - 1));
-	BUG_ON(end & (HPAGE_SIZE - 1));
-
-	for (address = start; address < end; address += HPAGE_SIZE) {
-		pte = huge_pte_offset(mm, address);
-		BUG_ON(!pte);
-		if (pte_none(*pte))
-			continue;
-		page = pte_page(*pte);
-		put_page(page);
-		for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-			pte_clear(mm, address+(i*PAGE_SIZE), pte);
-			pte++;
-		}
-	}
-	add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
-	flush_tlb_range(vma, start, end);
-}
-
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
-
-	BUG_ON(vma->vm_start & ~HPAGE_MASK);
-	BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-	spin_lock(&mm->page_table_lock);
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
-
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		if (!pte_none(*pte))
-			continue;
-
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = find_get_page(mapping, idx);
-		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			page = alloc_huge_page();
-			if (!page) {
-				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
-				goto out;
-			}
-			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-			if (! ret) {
-				unlock_page(page);
-			} else {
-				hugetlb_put_quota(mapping);
-				free_huge_page(page);
-				goto out;
-			}
-		}
-		set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
-	}
-out:
-	spin_unlock(&mm->page_table_lock);
-	return ret;
-}

From f9c98d0287de42221c624482fd4f8d485c98ab22 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:10 -0700
Subject: [PATCH 37/91] [PATCH] mm: m68k kill stram swap

Please, please now delete the Atari CONFIG_STRAM_SWAP code.  It may be
excellent and ingenious code, but its reference to swap_vfsmnt betrays that it
hasn't been built since 2.5.1 (four years old come December); it delves deep
into matters which are the preserve of core mm code; its only purpose is to
give the more conscientious mm guys an anxiety attack from time to time; yet
we keep on breaking it more and more.

If you want to use RAM for swap, then if the MTD driver does not already
provide just what you need, I'm sure David could be persuaded to add the
extra.  But you'd also like to be able to allocate extents of that swap for
other use: we can give you a core interface for that if you need.  But unbuilt
for four years suggests to me that there's no need at all.

I cannot swear the patch below won't break your build, but I believe it won't.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt   |   2 -
 Documentation/m68k/kernel-options.txt |  24 +-
 arch/m68k/Kconfig                     |  24 +-
 arch/m68k/atari/stram.c               | 918 +-------------------------
 4 files changed, 17 insertions(+), 951 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 90766b75d1b7..5dffcfefc3c7 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1460,8 +1460,6 @@ running once the system is up.
 	stifb=		[HW]
 			Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
 
-	stram_swap=	[HW,M68k]
-
 	swiotlb=	[IA-64] Number of I/O TLB slabs
 
 	switches=	[HW,M68k]
diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt
index e191baad8308..d5d3f064f552 100644
--- a/Documentation/m68k/kernel-options.txt
+++ b/Documentation/m68k/kernel-options.txt
@@ -626,7 +626,7 @@ ignored (others aren't affected).
     can be performed in optimal order. Not all SCSI devices support
     tagged queuing (:-().
 
-4.6 switches=
+4.5 switches=
 -------------
 
 Syntax: switches=<list of switches>
@@ -661,28 +661,6 @@ correctly.
 earlier initialization ("ov_"-less) takes precedence. But the
 switching-off on reset still happens in this case.
 
-4.5) stram_swap=
-----------------
-
-Syntax: stram_swap=<do_swap>[,<max_swap>]
-
-  This option is available only if the kernel has been compiled with
-CONFIG_STRAM_SWAP enabled. Normally, the kernel then determines
-dynamically whether to actually use ST-RAM as swap space. (Currently,
-the fraction of ST-RAM must be less or equal 1/3 of total memory to
-enable this swapping.) You can override the kernel's decision by
-specifying this option. 1 for <do_swap> means always enable the swap,
-even if you have less alternate RAM. 0 stands for never swap to
-ST-RAM, even if it's small enough compared to the rest of memory.
-
-  If ST-RAM swapping is enabled, the kernel usually uses all free
-ST-RAM as swap "device". If the kernel resides in ST-RAM, the region
-allocated by it is obviously never used for swapping :-) You can also
-limit this amount by specifying the second parameter, <max_swap>, if
-you want to use parts of ST-RAM as normal system memory. <max_swap> is
-in kBytes and the number should be a multiple of 4 (otherwise: rounded
-down).
-
 5) Options for Amiga Only:
 ==========================
 
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index ba960bbc8e6d..1dd5d18b2201 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -388,33 +388,11 @@ config AMIGA_PCMCIA
 	  Include support in the kernel for pcmcia on Amiga 1200 and Amiga
 	  600. If you intend to use pcmcia cards say Y; otherwise say N.
 
-config STRAM_SWAP
-	bool "Support for ST-RAM as swap space"
-	depends on ATARI && BROKEN
-	---help---
-	  Some Atari 68k machines (including the 520STF and 1020STE) divide
-	  their addressable memory into ST and TT sections.  The TT section
-	  (up to 512MB) is the main memory; the ST section (up to 4MB) is
-	  accessible to the built-in graphics board, runs slower, and is
-	  present mainly for backward compatibility with older machines.
-
-	  This enables support for using (parts of) ST-RAM as swap space,
-	  instead of as normal system memory. This can first enhance system
-	  performance if you have lots of alternate RAM (compared to the size
-	  of ST-RAM), because executable code always will reside in faster
-	  memory. ST-RAM will remain as ultra-fast swap space. On the other
-	  hand, it allows much improved dynamic allocations of ST-RAM buffers
-	  for device driver modules (e.g. floppy, ACSI, SLM printer, DMA
-	  sound). The probability that such allocations at module load time
-	  fail is drastically reduced.
-
 config STRAM_PROC
 	bool "ST-RAM statistics in /proc"
 	depends on ATARI
 	help
-	  Say Y here to report ST-RAM usage statistics in /proc/stram.  See
-	  the help for CONFIG_STRAM_SWAP for discussion of ST-RAM and its
-	  uses.
+	  Say Y here to report ST-RAM usage statistics in /proc/stram.
 
 config HEARTBEAT
 	bool "Use power LED as a heartbeat" if AMIGA || APOLLO || ATARI || MAC ||Q40
diff --git a/arch/m68k/atari/stram.c b/arch/m68k/atari/stram.c
index 5a3c106b40c8..22e0481a5f7b 100644
--- a/arch/m68k/atari/stram.c
+++ b/arch/m68k/atari/stram.c
@@ -15,11 +15,9 @@
 #include <linux/kdev_t.h>
 #include <linux/major.h>
 #include <linux/init.h>
-#include <linux/swap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
-#include <linux/shm.h>
 #include <linux/bootmem.h>
 #include <linux/mount.h>
 #include <linux/blkdev.h>
@@ -33,8 +31,6 @@
 #include <asm/io.h>
 #include <asm/semaphore.h>
 
-#include <linux/swapops.h>
-
 #undef DEBUG
 
 #ifdef DEBUG
@@ -49,8 +45,7 @@
 #include <linux/proc_fs.h>
 #endif
 
-/* Pre-swapping comments:
- *
+/*
  * ++roman:
  *
  * New version of ST-Ram buffer allocation. Instead of using the
@@ -75,76 +70,6 @@
  *
  */
 
-/*
- * New Nov 1997: Use ST-RAM as swap space!
- *
- * In the past, there were often problems with modules that require ST-RAM
- * buffers. Such drivers have to use __get_dma_pages(), which unfortunately
- * often isn't very successful in allocating more than 1 page :-( [1] The net
- * result was that most of the time you couldn't insmod such modules (ataflop,
- * ACSI, SCSI on Falcon, Atari internal framebuffer, not to speak of acsi_slm,
- * which needs a 1 MB buffer... :-).
- *
- * To overcome this limitation, ST-RAM can now be turned into a very
- * high-speed swap space. If a request for an ST-RAM buffer comes, the kernel
- * now tries to unswap some pages on that swap device to make some free (and
- * contiguous) space. This works much better in comparison to
- * __get_dma_pages(), since used swap pages can be selectively freed by either
- * moving them to somewhere else in swap space, or by reading them back into
- * system memory. Ok, there operation of unswapping isn't really cheap (for
- * each page, one has to go through the page tables of all processes), but it
- * doesn't happen that often (only when allocation ST-RAM, i.e. when loading a
- * module that needs ST-RAM). But it at least makes it possible to load such
- * modules!
- *
- * It could also be that overall system performance increases a bit due to
- * ST-RAM swapping, since slow ST-RAM isn't used anymore for holding data or
- * executing code in. It's then just a (very fast, compared to disk) back
- * storage for not-so-often needed data. (But this effect must be compared
- * with the loss of total memory...) Don't know if the effect is already
- * visible on a TT, where the speed difference between ST- and TT-RAM isn't
- * that dramatic, but it should on machines where TT-RAM is really much faster
- * (e.g. Afterburner).
- *
- *   [1]: __get_free_pages() does a fine job if you only want one page, but if
- * you want more (contiguous) pages, it can give you such a block only if
- * there's already a free one. The algorithm can't try to free buffers or swap
- * out something in order to make more free space, since all that page-freeing
- * mechanisms work "target-less", i.e. they just free something, but not in a
- * specific place. I.e., __get_free_pages() can't do anything to free
- * *adjacent* pages :-( This situation becomes even worse for DMA memory,
- * since the freeing algorithms are also blind to DMA capability of pages.
- */
-
-/* 1998-10-20: ++andreas
-   unswap_by_move disabled because it does not handle swapped shm pages.
-*/
-
-/* 2000-05-01: ++andreas
-   Integrated with bootmem.  Remove all traces of unswap_by_move.
-*/
-
-#ifdef CONFIG_STRAM_SWAP
-#define ALIGN_IF_SWAP(x)	PAGE_ALIGN(x)
-#else
-#define ALIGN_IF_SWAP(x)	(x)
-#endif
-
-/* get index of swap page at address 'addr' */
-#define SWAP_NR(addr)		(((addr) - swap_start) >> PAGE_SHIFT)
-
-/* get address of swap page #'nr' */
-#define SWAP_ADDR(nr)		(swap_start + ((nr) << PAGE_SHIFT))
-
-/* get number of pages for 'n' bytes (already page-aligned) */
-#define N_PAGES(n)			((n) >> PAGE_SHIFT)
-
-/* The following two numbers define the maximum fraction of ST-RAM in total
- * memory, below that the kernel would automatically use ST-RAM as swap
- * space. This decision can be overridden with stram_swap= */
-#define MAX_STRAM_FRACTION_NOM		1
-#define MAX_STRAM_FRACTION_DENOM	3
-
 /* Start and end (virtual) of ST-RAM */
 static void *stram_start, *stram_end;
 
@@ -164,10 +89,9 @@ typedef struct stram_block {
 } BLOCK;
 
 /* values for flags field */
-#define BLOCK_FREE		0x01	/* free structure in the BLOCKs pool */
+#define BLOCK_FREE	0x01	/* free structure in the BLOCKs pool */
 #define BLOCK_KMALLOCED	0x02	/* structure allocated by kmalloc() */
-#define BLOCK_GFP		0x08	/* block allocated with __get_dma_pages() */
-#define BLOCK_INSWAP	0x10	/* block allocated in swap space */
+#define BLOCK_GFP	0x08	/* block allocated with __get_dma_pages() */
 
 /* list of allocated blocks */
 static BLOCK *alloc_list;
@@ -179,60 +103,8 @@ static BLOCK *alloc_list;
 #define N_STATIC_BLOCKS	20
 static BLOCK static_blocks[N_STATIC_BLOCKS];
 
-#ifdef CONFIG_STRAM_SWAP
-/* max. number of bytes to use for swapping
- *  0 = no ST-RAM swapping
- * -1 = do swapping (to whole ST-RAM) if it's less than MAX_STRAM_FRACTION of
- *      total memory
- */
-static int max_swap_size = -1;
-
-/* start and end of swapping area */
-static void *swap_start, *swap_end;
-
-/* The ST-RAM's swap info structure */
-static struct swap_info_struct *stram_swap_info;
-
-/* The ST-RAM's swap type */
-static int stram_swap_type;
-
-/* Semaphore for get_stram_region.  */
-static DECLARE_MUTEX(stram_swap_sem);
-
-/* major and minor device number of the ST-RAM device; for the major, we use
- * the same as Amiga z2ram, which is really similar and impossible on Atari,
- * and for the minor a relatively odd number to avoid the user creating and
- * using that device. */
-#define	STRAM_MAJOR		Z2RAM_MAJOR
-#define	STRAM_MINOR		13
-
-/* Some impossible pointer value */
-#define MAGIC_FILE_P	(struct file *)0xffffdead
-
-#ifdef DO_PROC
-static unsigned stat_swap_read;
-static unsigned stat_swap_write;
-static unsigned stat_swap_force;
-#endif /* DO_PROC */
-
-#endif /* CONFIG_STRAM_SWAP */
-
 /***************************** Prototypes *****************************/
 
-#ifdef CONFIG_STRAM_SWAP
-static int swap_init(void *start_mem, void *swap_data);
-static void *get_stram_region( unsigned long n_pages );
-static void free_stram_region( unsigned long offset, unsigned long n_pages
-			       );
-static int in_some_region(void *addr);
-static unsigned long find_free_region( unsigned long n_pages, unsigned long
-				       *total_free, unsigned long
-				       *region_free );
-static void do_stram_request(request_queue_t *);
-static int stram_open( struct inode *inode, struct file *filp );
-static int stram_release( struct inode *inode, struct file *filp );
-static void reserve_region(void *start, void *end);
-#endif
 static BLOCK *add_region( void *addr, unsigned long size );
 static BLOCK *find_region( void *addr );
 static int remove_region( BLOCK *block );
@@ -279,84 +151,11 @@ void __init atari_stram_init(void)
  */
 void __init atari_stram_reserve_pages(void *start_mem)
 {
-#ifdef CONFIG_STRAM_SWAP
-	/* if max_swap_size is negative (i.e. no stram_swap= option given),
-	 * determine at run time whether to use ST-RAM swapping */
-	if (max_swap_size < 0)
-		/* Use swapping if ST-RAM doesn't make up more than MAX_STRAM_FRACTION
-		 * of total memory. In that case, the max. size is set to 16 MB,
-		 * because ST-RAM can never be bigger than that.
-		 * Also, never use swapping on a Hades, there's no separate ST-RAM in
-		 * that machine. */
-		max_swap_size =
-			(!MACH_IS_HADES &&
-			 (N_PAGES(stram_end-stram_start)*MAX_STRAM_FRACTION_DENOM <=
-			  ((unsigned long)high_memory>>PAGE_SHIFT)*MAX_STRAM_FRACTION_NOM)) ? 16*1024*1024 : 0;
-	DPRINTK( "atari_stram_reserve_pages: max_swap_size = %d\n", max_swap_size );
-#endif
-
 	/* always reserve first page of ST-RAM, the first 2 kB are
 	 * supervisor-only! */
 	if (!kernel_in_stram)
 		reserve_bootmem (0, PAGE_SIZE);
 
-#ifdef CONFIG_STRAM_SWAP
-	{
-		void *swap_data;
-
-		start_mem = (void *) PAGE_ALIGN ((unsigned long) start_mem);
-		/* determine first page to use as swap: if the kernel is
-		   in TT-RAM, this is the first page of (usable) ST-RAM;
-		   otherwise just use the end of kernel data (= start_mem) */
-		swap_start = !kernel_in_stram ? stram_start + PAGE_SIZE : start_mem;
-		/* decrement by one page, rest of kernel assumes that first swap page
-		 * is always reserved and maybe doesn't handle swp_entry == 0
-		 * correctly */
-		swap_start -= PAGE_SIZE;
-		swap_end = stram_end;
-		if (swap_end-swap_start > max_swap_size)
-			swap_end =  swap_start + max_swap_size;
-		DPRINTK( "atari_stram_reserve_pages: swapping enabled; "
-				 "swap=%p-%p\n", swap_start, swap_end);
-
-		/* reserve some amount of memory for maintainance of
-		 * swapping itself: one page for each 2048 (PAGE_SIZE/2)
-		 * swap pages. (2 bytes for each page) */
-		swap_data = start_mem;
-		start_mem += ((SWAP_NR(swap_end) + PAGE_SIZE/2 - 1)
-			      >> (PAGE_SHIFT-1)) << PAGE_SHIFT;
-		/* correct swap_start if necessary */
-		if (swap_start + PAGE_SIZE == swap_data)
-			swap_start = start_mem - PAGE_SIZE;
-
-		if (!swap_init( start_mem, swap_data )) {
-			printk( KERN_ERR "ST-RAM swap space initialization failed\n" );
-			max_swap_size = 0;
-			return;
-		}
-		/* reserve region for swapping meta-data */
-		reserve_region(swap_data, start_mem);
-		/* reserve swapping area itself */
-		reserve_region(swap_start + PAGE_SIZE, swap_end);
-
-		/*
-		 * If the whole ST-RAM is used for swapping, there are no allocatable
-		 * dma pages left. But unfortunately, some shared parts of the kernel
-		 * (particularly the SCSI mid-level) call __get_dma_pages()
-		 * unconditionally :-( These calls then fail, and scsi.c even doesn't
-		 * check for NULL return values and just crashes. The quick fix for
-		 * this (instead of doing much clean up work in the SCSI code) is to
-		 * pretend all pages are DMA-able by setting mach_max_dma_address to
-		 * ULONG_MAX. This doesn't change any functionality so far, since
-		 * get_dma_pages() shouldn't be used on Atari anyway anymore (better
-		 * use atari_stram_alloc()), and the Atari SCSI drivers don't need DMA
-		 * memory. But unfortunately there's now no kind of warning (even not
-		 * a NULL return value) if you use get_dma_pages() nevertheless :-(
-		 * You just will get non-DMA-able memory...
-		 */
-		mach_max_dma_address = 0xffffffff;
-	}
-#endif
 }
 
 void atari_stram_mem_init_hook (void)
@@ -367,7 +166,6 @@ void atari_stram_mem_init_hook (void)
 
 /*
  * This is main public interface: somehow allocate a ST-RAM block
- * There are three strategies:
  *
  *  - If we're before mem_init(), we have to make a static allocation. The
  *    region is taken in the kernel data area (if the kernel is in ST-RAM) or
@@ -375,14 +173,9 @@ void atari_stram_mem_init_hook (void)
  *    rsvd_stram_* region. The ST-RAM is somewhere in the middle of kernel
  *    address space in the latter case.
  *
- *  - If mem_init() already has been called and ST-RAM swapping is enabled,
- *    try to get the memory from the (pseudo) swap-space, either free already
- *    or by moving some other pages out of the swap.
- *
- *  - If mem_init() already has been called, and ST-RAM swapping is not
- *    enabled, the only possibility is to try with __get_dma_pages(). This has
- *    the disadvantage that it's very hard to get more than 1 page, and it is
- *    likely to fail :-(
+ *  - If mem_init() already has been called, try with __get_dma_pages().
+ *    This has the disadvantage that it's very hard to get more than 1 page,
+ *    and it is likely to fail :-(
  *
  */
 void *atari_stram_alloc(long size, const char *owner)
@@ -393,27 +186,13 @@ void *atari_stram_alloc(long size, const char *owner)
 
 	DPRINTK("atari_stram_alloc(size=%08lx,owner=%s)\n", size, owner);
 
-	size = ALIGN_IF_SWAP(size);
-	DPRINTK( "atari_stram_alloc: rounded size = %08lx\n", size );
-#ifdef CONFIG_STRAM_SWAP
-	if (max_swap_size) {
-		/* If swapping is active: make some free space in the swap
-		   "device". */
-		DPRINTK( "atari_stram_alloc: after mem_init, swapping ok, "
-				 "calling get_region\n" );
-		addr = get_stram_region( N_PAGES(size) );
-		flags = BLOCK_INSWAP;
-	}
-	else
-#endif
 	if (!mem_init_done)
 		return alloc_bootmem_low(size);
 	else {
-		/* After mem_init() and no swapping: can only resort to
-		 * __get_dma_pages() */
+		/* After mem_init(): can only resort to __get_dma_pages() */
 		addr = (void *)__get_dma_pages(GFP_KERNEL, get_order(size));
 		flags = BLOCK_GFP;
-		DPRINTK( "atari_stram_alloc: after mem_init, swapping off, "
+		DPRINTK( "atari_stram_alloc: after mem_init, "
 				 "get_pages=%p\n", addr );
 	}
 
@@ -422,12 +201,7 @@ void *atari_stram_alloc(long size, const char *owner)
 			/* out of memory for BLOCK structure :-( */
 			DPRINTK( "atari_stram_alloc: out of mem for BLOCK -- "
 					 "freeing again\n" );
-#ifdef CONFIG_STRAM_SWAP
-			if (flags == BLOCK_INSWAP)
-				free_stram_region( SWAP_NR(addr), N_PAGES(size) );
-			else
-#endif
-				free_pages((unsigned long)addr, get_order(size));
+			free_pages((unsigned long)addr, get_order(size));
 			return( NULL );
 		}
 		block->owner = owner;
@@ -451,25 +225,12 @@ void atari_stram_free( void *addr )
 	DPRINTK( "atari_stram_free: found block (%p): size=%08lx, owner=%s, "
 			 "flags=%02x\n", block, block->size, block->owner, block->flags );
 
-#ifdef CONFIG_STRAM_SWAP
-	if (!max_swap_size) {
-#endif
-		if (block->flags & BLOCK_GFP) {
-			DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n",
-				get_order(block->size));
-			free_pages((unsigned long)addr, get_order(block->size));
-		}
-		else
-			goto fail;
-#ifdef CONFIG_STRAM_SWAP
-	}
-	else if (block->flags & BLOCK_INSWAP) {
-		DPRINTK( "atari_stram_free: is swap-alloced\n" );
-		free_stram_region( SWAP_NR(block->start), N_PAGES(block->size) );
-	}
-	else
+	if (!(block->flags & BLOCK_GFP))
 		goto fail;
-#endif
+
+	DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n",
+		get_order(block->size));
+	free_pages((unsigned long)addr, get_order(block->size));
 	remove_region( block );
 	return;
 
@@ -478,612 +239,6 @@ void atari_stram_free( void *addr )
 			"(called from %p)\n", addr, __builtin_return_address(0) );
 }
 
-
-#ifdef CONFIG_STRAM_SWAP
-
-
-/* ------------------------------------------------------------------------ */
-/*						   Main Swapping Functions							*/
-/* ------------------------------------------------------------------------ */
-
-
-/*
- * Initialize ST-RAM swap device
- * (lots copied and modified from sys_swapon() in mm/swapfile.c)
- */
-static int __init swap_init(void *start_mem, void *swap_data)
-{
-	static struct dentry fake_dentry;
-	static struct vfsmount fake_vfsmnt;
-	struct swap_info_struct *p;
-	struct inode swap_inode;
-	unsigned int type;
-	void *addr;
-	int i, j, k, prev;
-
-	DPRINTK("swap_init(start_mem=%p, swap_data=%p)\n",
-		start_mem, swap_data);
-
-	/* need at least one page for swapping to (and this also isn't very
-	 * much... :-) */
-	if (swap_end - swap_start < 2*PAGE_SIZE) {
-		printk( KERN_WARNING "stram_swap_init: swap space too small\n" );
-		return( 0 );
-	}
-
-	/* find free slot in swap_info */
-	for( p = swap_info, type = 0; type < nr_swapfiles; type++, p++ )
-		if (!(p->flags & SWP_USED))
-			break;
-	if (type >= MAX_SWAPFILES) {
-		printk( KERN_WARNING "stram_swap_init: max. number of "
-				"swap devices exhausted\n" );
-		return( 0 );
-	}
-	if (type >= nr_swapfiles)
-		nr_swapfiles = type+1;
-
-	stram_swap_info = p;
-	stram_swap_type = type;
-
-	/* fake some dir cache entries to give us some name in /dev/swaps */
-	fake_dentry.d_parent = &fake_dentry;
-	fake_dentry.d_name.name = "stram (internal)";
-	fake_dentry.d_name.len = 16;
-	fake_vfsmnt.mnt_parent = &fake_vfsmnt;
-
-	p->flags        = SWP_USED;
-	p->swap_file    = &fake_dentry;
-	p->swap_vfsmnt  = &fake_vfsmnt;
-	p->swap_map	= swap_data;
-	p->cluster_nr   = 0;
-	p->next         = -1;
-	p->prio         = 0x7ff0;	/* a rather high priority, but not the higest
-								 * to give the user a chance to override */
-
-	/* call stram_open() directly, avoids at least the overhead in
-	 * constructing a dummy file structure... */
-	swap_inode.i_rdev = MKDEV( STRAM_MAJOR, STRAM_MINOR );
-	stram_open( &swap_inode, MAGIC_FILE_P );
-	p->max = SWAP_NR(swap_end);
-
-	/* initialize swap_map: set regions that are already allocated or belong
-	 * to kernel data space to SWAP_MAP_BAD, otherwise to free */
-	j = 0; /* # of free pages */
-	k = 0; /* # of already allocated pages (from pre-mem_init stram_alloc()) */
-	p->lowest_bit = 0;
-	p->highest_bit = 0;
-	for( i = 1, addr = SWAP_ADDR(1); i < p->max;
-		 i++, addr += PAGE_SIZE ) {
-		if (in_some_region( addr )) {
-			p->swap_map[i] = SWAP_MAP_BAD;
-			++k;
-		}
-		else if (kernel_in_stram && addr < start_mem ) {
-			p->swap_map[i] = SWAP_MAP_BAD;
-		}
-		else {
-			p->swap_map[i] = 0;
-			++j;
-			if (!p->lowest_bit) p->lowest_bit = i;
-			p->highest_bit = i;
-		}
-	}
-	/* first page always reserved (and doesn't really belong to swap space) */
-	p->swap_map[0] = SWAP_MAP_BAD;
-
-	/* now swapping to this device ok */
-	p->pages = j + k;
-	swap_list_lock();
-	nr_swap_pages += j;
-	p->flags = SWP_WRITEOK;
-
-	/* insert swap space into swap_list */
-	prev = -1;
-	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
-		if (p->prio >= swap_info[i].prio) {
-			break;
-		}
-		prev = i;
-	}
-	p->next = i;
-	if (prev < 0) {
-		swap_list.head = swap_list.next = p - swap_info;
-	} else {
-		swap_info[prev].next = p - swap_info;
-	}
-	swap_list_unlock();
-
-	printk( KERN_INFO "Using %dk (%d pages) of ST-RAM as swap space.\n",
-			p->pages << 2, p->pages );
-	return( 1 );
-}
-
-
-/*
- * The swap entry has been read in advance, and we return 1 to indicate
- * that the page has been used or is no longer needed.
- *
- * Always set the resulting pte to be nowrite (the same as COW pages
- * after one process has exited).  We don't know just how many PTEs will
- * share this swap entry, so be cautious and let do_wp_page work out
- * what to do if a write is requested later.
- */
-static inline void unswap_pte(struct vm_area_struct * vma, unsigned long
-			      address, pte_t *dir, swp_entry_t entry,
-			      struct page *page)
-{
-	pte_t pte = *dir;
-
-	if (pte_none(pte))
-		return;
-	if (pte_present(pte)) {
-		/* If this entry is swap-cached, then page must already
-                   hold the right address for any copies in physical
-                   memory */
-		if (pte_page(pte) != page)
-			return;
-		/* We will be removing the swap cache in a moment, so... */
-		set_pte(dir, pte_mkdirty(pte));
-		return;
-	}
-	if (pte_val(pte) != entry.val)
-		return;
-
-	DPRINTK("unswap_pte: replacing entry %08lx by new page %p",
-		entry.val, page);
-	set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
-	swap_free(entry);
-	get_page(page);
-	inc_mm_counter(vma->vm_mm, rss);
-}
-
-static inline void unswap_pmd(struct vm_area_struct * vma, pmd_t *dir,
-			      unsigned long address, unsigned long size,
-			      unsigned long offset, swp_entry_t entry,
-			      struct page *page)
-{
-	pte_t * pte;
-	unsigned long end;
-
-	if (pmd_none(*dir))
-		return;
-	if (pmd_bad(*dir)) {
-		pmd_ERROR(*dir);
-		pmd_clear(dir);
-		return;
-	}
-	pte = pte_offset_kernel(dir, address);
-	offset += address & PMD_MASK;
-	address &= ~PMD_MASK;
-	end = address + size;
-	if (end > PMD_SIZE)
-		end = PMD_SIZE;
-	do {
-		unswap_pte(vma, offset+address-vma->vm_start, pte, entry, page);
-		address += PAGE_SIZE;
-		pte++;
-	} while (address < end);
-}
-
-static inline void unswap_pgd(struct vm_area_struct * vma, pgd_t *dir,
-			      unsigned long address, unsigned long size,
-			      swp_entry_t entry, struct page *page)
-{
-	pmd_t * pmd;
-	unsigned long offset, end;
-
-	if (pgd_none(*dir))
-		return;
-	if (pgd_bad(*dir)) {
-		pgd_ERROR(*dir);
-		pgd_clear(dir);
-		return;
-	}
-	pmd = pmd_offset(dir, address);
-	offset = address & PGDIR_MASK;
-	address &= ~PGDIR_MASK;
-	end = address + size;
-	if (end > PGDIR_SIZE)
-		end = PGDIR_SIZE;
-	do {
-		unswap_pmd(vma, pmd, address, end - address, offset, entry,
-			   page);
-		address = (address + PMD_SIZE) & PMD_MASK;
-		pmd++;
-	} while (address < end);
-}
-
-static void unswap_vma(struct vm_area_struct * vma, pgd_t *pgdir,
-		       swp_entry_t entry, struct page *page)
-{
-	unsigned long start = vma->vm_start, end = vma->vm_end;
-
-	do {
-		unswap_pgd(vma, pgdir, start, end - start, entry, page);
-		start = (start + PGDIR_SIZE) & PGDIR_MASK;
-		pgdir++;
-	} while (start < end);
-}
-
-static void unswap_process(struct mm_struct * mm, swp_entry_t entry,
-			   struct page *page)
-{
-	struct vm_area_struct* vma;
-
-	/*
-	 * Go through process' page directory.
-	 */
-	if (!mm)
-		return;
-	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-		unswap_vma(vma, pgd, entry, page);
-	}
-}
-
-
-static int unswap_by_read(unsigned short *map, unsigned long max,
-			  unsigned long start, unsigned long n_pages)
-{
-	struct task_struct *p;
-	struct page *page;
-	swp_entry_t entry;
-	unsigned long i;
-
-	DPRINTK( "unswapping %lu..%lu by reading in\n",
-			 start, start+n_pages-1 );
-
-	for( i = start; i < start+n_pages; ++i ) {
-		if (map[i] == SWAP_MAP_BAD) {
-			printk( KERN_ERR "get_stram_region: page %lu already "
-					"reserved??\n", i );
-			continue;
-		}
-
-		if (map[i]) {
-			entry = swp_entry(stram_swap_type, i);
-			DPRINTK("unswap: map[i=%lu]=%u nr_swap=%ld\n",
-				i, map[i], nr_swap_pages);
-
-			swap_device_lock(stram_swap_info);
-			map[i]++;
-			swap_device_unlock(stram_swap_info);
-			/* Get a page for the entry, using the existing
-			   swap cache page if there is one.  Otherwise,
-			   get a clean page and read the swap into it. */
-			page = read_swap_cache_async(entry, NULL, 0);
-			if (!page) {
-				swap_free(entry);
-				return -ENOMEM;
-			}
-			read_lock(&tasklist_lock);
-			for_each_process(p)
-				unswap_process(p->mm, entry, page);
-			read_unlock(&tasklist_lock);
-			shmem_unuse(entry, page);
-			/* Now get rid of the extra reference to the
-			   temporary page we've been using. */
-			if (PageSwapCache(page))
-				delete_from_swap_cache(page);
-			__free_page(page);
-	#ifdef DO_PROC
-			stat_swap_force++;
-	#endif
-		}
-
-		DPRINTK( "unswap: map[i=%lu]=%u nr_swap=%ld\n",
-				 i, map[i], nr_swap_pages );
-		swap_list_lock();
-		swap_device_lock(stram_swap_info);
-		map[i] = SWAP_MAP_BAD;
-		if (stram_swap_info->lowest_bit == i)
-			stram_swap_info->lowest_bit++;
-		if (stram_swap_info->highest_bit == i)
-			stram_swap_info->highest_bit--;
-		--nr_swap_pages;
-		swap_device_unlock(stram_swap_info);
-		swap_list_unlock();
-	}
-
-	return 0;
-}
-
-/*
- * reserve a region in ST-RAM swap space for an allocation
- */
-static void *get_stram_region( unsigned long n_pages )
-{
-	unsigned short *map = stram_swap_info->swap_map;
-	unsigned long max = stram_swap_info->max;
-	unsigned long start, total_free, region_free;
-	int err;
-	void *ret = NULL;
-
-	DPRINTK( "get_stram_region(n_pages=%lu)\n", n_pages );
-
-	down(&stram_swap_sem);
-
-	/* disallow writing to the swap device now */
-	stram_swap_info->flags = SWP_USED;
-
-	/* find a region of n_pages pages in the swap space including as much free
-	 * pages as possible (and excluding any already-reserved pages). */
-	if (!(start = find_free_region( n_pages, &total_free, &region_free )))
-		goto end;
-	DPRINTK( "get_stram_region: region starts at %lu, has %lu free pages\n",
-			 start, region_free );
-
-	err = unswap_by_read(map, max, start, n_pages);
-	if (err)
-		goto end;
-
-	ret = SWAP_ADDR(start);
-  end:
-	/* allow using swap device again */
-	stram_swap_info->flags = SWP_WRITEOK;
-	up(&stram_swap_sem);
-	DPRINTK( "get_stram_region: returning %p\n", ret );
-	return( ret );
-}
-
-
-/*
- * free a reserved region in ST-RAM swap space
- */
-static void free_stram_region( unsigned long offset, unsigned long n_pages )
-{
-	unsigned short *map = stram_swap_info->swap_map;
-
-	DPRINTK( "free_stram_region(offset=%lu,n_pages=%lu)\n", offset, n_pages );
-
-	if (offset < 1 || offset + n_pages > stram_swap_info->max) {
-		printk( KERN_ERR "free_stram_region: Trying to free non-ST-RAM\n" );
-		return;
-	}
-
-	swap_list_lock();
-	swap_device_lock(stram_swap_info);
-	/* un-reserve the freed pages */
-	for( ; n_pages > 0; ++offset, --n_pages ) {
-		if (map[offset] != SWAP_MAP_BAD)
-			printk( KERN_ERR "free_stram_region: Swap page %lu was not "
-					"reserved\n", offset );
-		map[offset] = 0;
-	}
-
-	/* update swapping meta-data */
-	if (offset < stram_swap_info->lowest_bit)
-		stram_swap_info->lowest_bit = offset;
-	if (offset+n_pages-1 > stram_swap_info->highest_bit)
-		stram_swap_info->highest_bit = offset+n_pages-1;
-	if (stram_swap_info->prio > swap_info[swap_list.next].prio)
-		swap_list.next = swap_list.head;
-	nr_swap_pages += n_pages;
-	swap_device_unlock(stram_swap_info);
-	swap_list_unlock();
-}
-
-
-/* ------------------------------------------------------------------------ */
-/*						Utility Functions for Swapping						*/
-/* ------------------------------------------------------------------------ */
-
-
-/* is addr in some of the allocated regions? */
-static int in_some_region(void *addr)
-{
-	BLOCK *p;
-
-	for( p = alloc_list; p; p = p->next ) {
-		if (p->start <= addr && addr < p->start + p->size)
-			return( 1 );
-	}
-	return( 0 );
-}
-
-
-static unsigned long find_free_region(unsigned long n_pages,
-				      unsigned long *total_free,
-				      unsigned long *region_free)
-{
-	unsigned short *map = stram_swap_info->swap_map;
-	unsigned long max = stram_swap_info->max;
-	unsigned long head, tail, max_start;
-	long nfree, max_free;
-
-	/* first scan the swap space for a suitable place for the allocation */
-	head = 1;
-	max_start = 0;
-	max_free = -1;
-	*total_free = 0;
-
-  start_over:
-	/* increment tail until final window size reached, and count free pages */
-	nfree = 0;
-	for( tail = head; tail-head < n_pages && tail < max; ++tail ) {
-		if (map[tail] == SWAP_MAP_BAD) {
-			head = tail+1;
-			goto start_over;
-		}
-		if (!map[tail]) {
-			++nfree;
-			++*total_free;
-		}
-	}
-	if (tail-head < n_pages)
-		goto out;
-	if (nfree > max_free) {
-		max_start = head;
-		max_free  = nfree;
-		if (max_free >= n_pages)
-			/* don't need more free pages... :-) */
-			goto out;
-	}
-
-	/* now shift the window and look for the area where as much pages as
-	 * possible are free */
-	while( tail < max ) {
-		nfree -= (map[head++] == 0);
-		if (map[tail] == SWAP_MAP_BAD) {
-			head = tail+1;
-			goto start_over;
-		}
-		if (!map[tail]) {
-			++nfree;
-			++*total_free;
-		}
-		++tail;
-		if (nfree > max_free) {
-			max_start = head;
-			max_free  = nfree;
-			if (max_free >= n_pages)
-				/* don't need more free pages... :-) */
-				goto out;
-		}
-	}
-
-  out:
-	if (max_free < 0) {
-		printk( KERN_NOTICE "get_stram_region: ST-RAM too full or fragmented "
-				"-- can't allocate %lu pages\n", n_pages );
-		return( 0 );
-	}
-
-	*region_free = max_free;
-	return( max_start );
-}
-
-
-/* setup parameters from command line */
-void __init stram_swap_setup(char *str, int *ints)
-{
-	if (ints[0] >= 1)
-		max_swap_size = ((ints[1] < 0 ? 0 : ints[1]) * 1024) & PAGE_MASK;
-}
-
-
-/* ------------------------------------------------------------------------ */
-/*								ST-RAM device								*/
-/* ------------------------------------------------------------------------ */
-
-static int refcnt;
-
-static void do_stram_request(request_queue_t *q)
-{
-	struct request *req;
-
-	while ((req = elv_next_request(q)) != NULL) {
-		void *start = swap_start + (req->sector << 9);
-		unsigned long len = req->current_nr_sectors << 9;
-		if ((start + len) > swap_end) {
-			printk( KERN_ERR "stram: bad access beyond end of device: "
-					"block=%ld, count=%d\n",
-					req->sector,
-					req->current_nr_sectors );
-			end_request(req, 0);
-			continue;
-		}
-
-		if (req->cmd == READ) {
-			memcpy(req->buffer, start, len);
-#ifdef DO_PROC
-			stat_swap_read += N_PAGES(len);
-#endif
-		}
-		else {
-			memcpy(start, req->buffer, len);
-#ifdef DO_PROC
-			stat_swap_write += N_PAGES(len);
-#endif
-		}
-		end_request(req, 1);
-	}
-}
-
-
-static int stram_open( struct inode *inode, struct file *filp )
-{
-	if (filp != MAGIC_FILE_P) {
-		printk( KERN_NOTICE "Only kernel can open ST-RAM device\n" );
-		return( -EPERM );
-	}
-	if (refcnt)
-		return( -EBUSY );
-	++refcnt;
-	return( 0 );
-}
-
-static int stram_release( struct inode *inode, struct file *filp )
-{
-	if (filp != MAGIC_FILE_P) {
-		printk( KERN_NOTICE "Only kernel can close ST-RAM device\n" );
-		return( -EPERM );
-	}
-	if (refcnt > 0)
-		--refcnt;
-	return( 0 );
-}
-
-
-static struct block_device_operations stram_fops = {
-	.open =		stram_open,
-	.release =	stram_release,
-};
-
-static struct gendisk *stram_disk;
-static struct request_queue *stram_queue;
-static DEFINE_SPINLOCK(stram_lock);
-
-int __init stram_device_init(void)
-{
-	if (!MACH_IS_ATARI)
-		/* no point in initializing this, I hope */
-		return -ENXIO;
-
-	if (!max_swap_size)
-		/* swapping not enabled */
-		return -ENXIO;
-	stram_disk = alloc_disk(1);
-	if (!stram_disk)
-		return -ENOMEM;
-
-	if (register_blkdev(STRAM_MAJOR, "stram")) {
-		put_disk(stram_disk);
-		return -ENXIO;
-	}
-
-	stram_queue = blk_init_queue(do_stram_request, &stram_lock);
-	if (!stram_queue) {
-		unregister_blkdev(STRAM_MAJOR, "stram");
-		put_disk(stram_disk);
-		return -ENOMEM;
-	}
-
-	stram_disk->major = STRAM_MAJOR;
-	stram_disk->first_minor = STRAM_MINOR;
-	stram_disk->fops = &stram_fops;
-	stram_disk->queue = stram_queue;
-	sprintf(stram_disk->disk_name, "stram");
-	set_capacity(stram_disk, (swap_end - swap_start)/512);
-	add_disk(stram_disk);
-	return 0;
-}
-
-
-
-/* ------------------------------------------------------------------------ */
-/*							Misc Utility Functions							*/
-/* ------------------------------------------------------------------------ */
-
-/* reserve a range of pages */
-static void reserve_region(void *start, void *end)
-{
-	reserve_bootmem (virt_to_phys(start), end - start);
-}
-
-#endif /* CONFIG_STRAM_SWAP */
-
 
 /* ------------------------------------------------------------------------ */
 /*							  Region Management								*/
@@ -1173,50 +328,9 @@ int get_stram_list( char *buf )
 {
 	int len = 0;
 	BLOCK *p;
-#ifdef CONFIG_STRAM_SWAP
-	int i;
-	unsigned short *map = stram_swap_info->swap_map;
-	unsigned long max = stram_swap_info->max;
-	unsigned free = 0, used = 0, rsvd = 0;
-#endif
 
-#ifdef CONFIG_STRAM_SWAP
-	if (max_swap_size) {
-		for( i = 1; i < max; ++i ) {
-			if (!map[i])
-				++free;
-			else if (map[i] == SWAP_MAP_BAD)
-				++rsvd;
-			else
-				++used;
-		}
-		PRINT_PROC(
-			"Total ST-RAM:      %8u kB\n"
-			"Total ST-RAM swap: %8lu kB\n"
-			"Free swap:         %8u kB\n"
-			"Used swap:         %8u kB\n"
-			"Allocated swap:    %8u kB\n"
-			"Swap Reads:        %8u\n"
-			"Swap Writes:       %8u\n"
-			"Swap Forced Reads: %8u\n",
-			(stram_end - stram_start) >> 10,
-			(max-1) << (PAGE_SHIFT-10),
-			free << (PAGE_SHIFT-10),
-			used << (PAGE_SHIFT-10),
-			rsvd << (PAGE_SHIFT-10),
-			stat_swap_read,
-			stat_swap_write,
-			stat_swap_force );
-	}
-	else {
-#endif
-		PRINT_PROC( "ST-RAM swapping disabled\n" );
-		PRINT_PROC("Total ST-RAM:      %8u kB\n",
+	PRINT_PROC("Total ST-RAM:      %8u kB\n",
 			   (stram_end - stram_start) >> 10);
-#ifdef CONFIG_STRAM_SWAP
-	}
-#endif
-
 	PRINT_PROC( "Allocated regions:\n" );
 	for( p = alloc_list; p; p = p->next ) {
 		if (len + 50 >= PAGE_SIZE)
@@ -1227,8 +341,6 @@ int get_stram_list( char *buf )
 			   p->owner);
 		if (p->flags & BLOCK_GFP)
 			PRINT_PROC( "page-alloced)\n" );
-		else if (p->flags & BLOCK_INSWAP)
-			PRINT_PROC( "in swap)\n" );
 		else
 			PRINT_PROC( "??)\n" );
 	}

From b5810039a54e5babf428e9a1e89fc1940fabff11 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 29 Oct 2005 18:16:12 -0700
Subject: [PATCH 38/91] [PATCH] core remove PageReserved

Remove PageReserved() calls from core code by tightening VM_RESERVED
handling in mm/ to cover PageReserved functionality.

PageReserved special casing is removed from get_page and put_page.

All setting and clearing of PageReserved is retained, and it is now flagged
in the page_alloc checks to help ensure we don't introduce any refcount
based freeing of Reserved pages.

MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being
deprecated.  We never completely handled it correctly anyway, and it can be
reintroduced in future if required (Hugh has a proof of concept).

Once PageReserved() calls are removed from kernel/power/swsusp.c, and all
arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can
be trivially removed.

Last real user of PageReserved is swsusp, which uses PageReserved to
determine whether a struct page points to valid memory or not.  This still
needs to be addressed (a generic page_is_ram() should work).

A last caveat: the ZERO_PAGE is now refcounted and managed with rmap (and
thus mapcounted and counted towards shared rss).  These writes to the struct
page could cause excessive cacheline bouncing on big systems.  There are a
number of ways this could be addressed if it is an issue.
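
A minimal sketch of what this means for code that maps special pages
(illustrative only; example_io_mmap() is a made-up name): instead of relying
on put_page() skipping PageReserved pages, the mapping itself is flagged so
the core VM never applies rmap or refcounting to it.  remap_pfn_range() sets
these flags itself after this patch; code that installs the ptes by hand, as
the sparc io_remap_pfn_range() hunks below do, must set them explicitly.

	static int example_io_mmap(struct file *file, struct vm_area_struct *vma)
	{
		/* pages behind this mapping are not ordinary refcounted RAM */
		vma->vm_flags |= VM_IO | VM_RESERVED;
		return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
				       vma->vm_end - vma->vm_start,
				       vma->vm_page_prot);
	}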

Signed-off-by: Nick Piggin <npiggin@suse.de>

Refcount bug fix for filemap_xip.c

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc64/kernel/vdso.c  |  12 ++--
 arch/sparc/mm/generic.c   |   3 +
 arch/sparc64/mm/generic.c |   3 +
 drivers/scsi/sg.c         |  12 ++--
 drivers/scsi/st.c         |  10 ++-
 fs/direct-io.c            |   4 +-
 include/linux/mm.h        |   5 +-
 kernel/power/swsusp.c     |  25 ++++---
 mm/bootmem.c              |   1 +
 mm/filemap_xip.c          |  11 +++-
 mm/fremap.c               |  23 ++++---
 mm/madvise.c              |   2 +-
 mm/memory.c               | 133 +++++++++++++++++++++++---------------
 mm/mempolicy.c            |  29 +++++----
 mm/mmap.c                 |  11 ++++
 mm/mprotect.c             |   8 +++
 mm/msync.c                |  17 +++--
 mm/page_alloc.c           |  14 ++--
 mm/rmap.c                 |  14 ++--
 mm/shmem.c                |   4 +-
 mm/swap.c                 |   4 +-
 sound/core/pcm_native.c   |   9 +--
 22 files changed, 219 insertions(+), 135 deletions(-)

diff --git a/arch/ppc64/kernel/vdso.c b/arch/ppc64/kernel/vdso.c
index efa985f05aca..4aacf521e3e4 100644
--- a/arch/ppc64/kernel/vdso.c
+++ b/arch/ppc64/kernel/vdso.c
@@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma,
 		return NOPAGE_SIGBUS;
 
 	/*
-	 * Last page is systemcfg, special handling here, no get_page() a
-	 * this is a reserved page
+	 * Last page is systemcfg.
 	 */
 	if ((vma->vm_end - address) <= PAGE_SIZE)
-		return virt_to_page(systemcfg);
+		pg = virt_to_page(systemcfg);
+	else
+		pg = virt_to_page(vbase + offset);
 
-	pg = virt_to_page(vbase + offset);
 	get_page(pg);
 	DBG(" ->page count: %d\n", page_count(pg));
 
@@ -259,7 +259,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack)
 	 * gettimeofday will be totally dead. It's fine to use that for setting
 	 * breakpoints in the vDSO code pages though
 	 */
-	vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+	vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_RESERVED;
 	vma->vm_flags |= mm->def_flags;
 	vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
 	vma->vm_ops = &vdso_vmops;
@@ -603,6 +603,8 @@ void __init vdso_init(void)
 		ClearPageReserved(pg);
 		get_page(pg);
 	}
+
+	get_page(virt_to_page(systemcfg));
 }
 
 int in_gate_area_no_task(unsigned long addr)
diff --git a/arch/sparc/mm/generic.c b/arch/sparc/mm/generic.c
index 20ccb957fb77..659c9a71f867 100644
--- a/arch/sparc/mm/generic.c
+++ b/arch/sparc/mm/generic.c
@@ -73,6 +73,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 	int space = GET_IOSPACE(pfn);
 	unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
+	/* See comment in mm/memory.c remap_pfn_range */
+	vma->vm_flags |= VM_IO | VM_RESERVED;
+
 	prot = __pgprot(pg_iobits);
 	offset -= from;
 	dir = pgd_offset(mm, from);
diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c
index c954d91f01d0..afc01cec701f 100644
--- a/arch/sparc64/mm/generic.c
+++ b/arch/sparc64/mm/generic.c
@@ -127,6 +127,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 	int space = GET_IOSPACE(pfn);
 	unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
+	/* See comment in mm/memory.c remap_pfn_range */
+	vma->vm_flags |= VM_IO | VM_RESERVED;
+
 	prot = __pgprot(pg_iobits);
 	offset -= from;
 	dir = pgd_offset(mm, from);
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 861e51375d70..2d30b46806bf 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1886,13 +1886,17 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages,
 	int i;
 
 	for (i=0; i < nr_pages; i++) {
-		if (dirtied && !PageReserved(sgl[i].page))
-			SetPageDirty(sgl[i].page);
-		/* unlock_page(sgl[i].page); */
+		struct page *page = sgl[i].page;
+
+		/* XXX: just for debug. Remove when PageReserved is removed */
+		BUG_ON(PageReserved(page));
+		if (dirtied)
+			SetPageDirty(page);
+		/* unlock_page(page); */
 		/* FIXME: cache flush missing for rw==READ
 		 * FIXME: call the correct reference counting function
 		 */
-		page_cache_release(sgl[i].page);
+		page_cache_release(page);
 	}
 
 	return 0;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 5eb54d8019b4..da9766283bd7 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -4526,12 +4526,16 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p
 	int i;
 
 	for (i=0; i < nr_pages; i++) {
-		if (dirtied && !PageReserved(sgl[i].page))
-			SetPageDirty(sgl[i].page);
+		struct page *page = sgl[i].page;
+
+		/* XXX: just for debug. Remove when PageReserved is removed */
+		BUG_ON(PageReserved(page));
+		if (dirtied)
+			SetPageDirty(page);
 		/* FIXME: cache flush missing for rw==READ
 		 * FIXME: call the correct reference counting function
 		 */
-		page_cache_release(sgl[i].page);
+		page_cache_release(page);
 	}
 
 	return 0;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0d06097bc995..3931e7f1e6bf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -162,6 +162,7 @@ static int dio_refill_pages(struct dio *dio)
 	up_read(&current->mm->mmap_sem);
 
 	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+		struct page *page = ZERO_PAGE(dio->curr_user_address);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
 		 * mapped blocks.  We need to use those blocks up to avoid
@@ -169,7 +170,8 @@ static int dio_refill_pages(struct dio *dio)
 		 */
 		if (dio->page_errors == 0)
 			dio->page_errors = ret;
-		dio->pages[0] = ZERO_PAGE(dio->curr_user_address);
+		page_cache_get(page);
+		dio->pages[0] = page;
 		dio->head = 0;
 		dio->tail = 1;
 		ret = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0c64484d8ae0..da42093250c3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_DONTCOPY	0x00020000      /* Do not copy this vma on fork */
 #define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
-#define VM_RESERVED	0x00080000	/* Don't unmap it from swap_out */
+#define VM_RESERVED	0x00080000	/* Pages managed in a special way */
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
 #define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
@@ -338,7 +338,7 @@ static inline void get_page(struct page *page)
 
 static inline void put_page(struct page *page)
 {
-	if (!PageReserved(page) && put_page_testzero(page))
+	if (put_page_testzero(page))
 		__page_cache_release(page);
 }
 
@@ -723,6 +723,7 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);
 
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 10bc5ec496d7..016504ccfccf 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -578,15 +578,23 @@ static int save_highmem_zone(struct zone *zone)
 			continue;
 		page = pfn_to_page(pfn);
 		/*
-		 * This condition results from rvmalloc() sans vmalloc_32()
-		 * and architectural memory reservations. This should be
-		 * corrected eventually when the cases giving rise to this
-		 * are better understood.
+		 * PageReserved results from rvmalloc() sans vmalloc_32()
+		 * and architectural memory reservations.
+		 *
+		 * rvmalloc should not cause this, because all implementations
+		 * appear to always be using vmalloc_32 on architectures with
+		 * highmem. This is a good thing, because we would like to save
+		 * rvmalloc pages.
+		 *
+		 * It appears to be triggered by pages which do not point to
+		 * valid memory (see arch/i386/mm/init.c:one_highpage_init(),
+		 * which sets PageReserved if the page does not point to valid
+		 * RAM.
+		 *
+		 * XXX: must remove usage of PageReserved!
 		 */
-		if (PageReserved(page)) {
-			printk("highmem reserved page?!\n");
+		if (PageReserved(page))
 			continue;
-		}
 		BUG_ON(PageNosave(page));
 		if (PageNosaveFree(page))
 			continue;
@@ -672,10 +680,9 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn)
 		return 0;
 
 	page = pfn_to_page(pfn);
-	BUG_ON(PageReserved(page) && PageNosave(page));
 	if (PageNosave(page))
 		return 0;
-	if (PageReserved(page) && pfn_is_nosave(pfn)) {
+	if (pfn_is_nosave(pfn)) {
 		pr_debug("[nosave pfn 0x%lx]", pfn);
 		return 0;
 	}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a58699b6579e..e8c567177dcf 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 				if (j + 16 < BITS_PER_LONG)
 					prefetchw(page + j + 16);
 				__ClearPageReserved(page + j);
+				set_page_count(page + j, 0);
 			}
 			__free_pages(page, order);
 			i += BITS_PER_LONG;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 8c199f537732..9354ee279b13 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -174,6 +174,7 @@ __xip_unmap (struct address_space * mapping,
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
+	struct page *page = ZERO_PAGE(address);
 
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -185,15 +186,17 @@ __xip_unmap (struct address_space * mapping,
 		 * We need the page_table_lock to protect us from page faults,
 		 * munmap, fork, etc...
 		 */
-		pte = page_check_address(ZERO_PAGE(address), mm,
-					 address);
+		pte = page_check_address(page, mm, address);
 		if (!IS_ERR(pte)) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush(vma, address, pte);
+			page_remove_rmap(page);
+			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap(pte);
 			spin_unlock(&mm->page_table_lock);
+			page_cache_release(page);
 		}
 	}
 	spin_unlock(&mapping->i_mmap_lock);
@@ -228,7 +231,7 @@ xip_file_nopage(struct vm_area_struct * area,
 
 	page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
 	if (!IS_ERR(page)) {
-		return page;
+		goto out;
 	}
 	if (PTR_ERR(page) != -ENODATA)
 		return NULL;
@@ -249,6 +252,8 @@ xip_file_nopage(struct vm_area_struct * area,
 		page = ZERO_PAGE(address);
 	}
 
+out:
+	page_cache_get(page);
 	return page;
 }
 
diff --git a/mm/fremap.c b/mm/fremap.c
index fd7f2a17ff3e..224cc1598b35 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -29,19 +29,20 @@ static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 		return;
 	if (pte_present(pte)) {
 		unsigned long pfn = pte_pfn(pte);
+		struct page *page;
 
 		flush_cache_page(vma, addr, pfn);
 		pte = ptep_clear_flush(vma, addr, ptep);
-		if (pfn_valid(pfn)) {
-			struct page *page = pfn_to_page(pfn);
-			if (!PageReserved(page)) {
-				if (pte_dirty(pte))
-					set_page_dirty(page);
-				page_remove_rmap(page);
-				page_cache_release(page);
-				dec_mm_counter(mm, file_rss);
-			}
+		if (unlikely(!pfn_valid(pfn))) {
+			print_bad_pte(vma, pte, addr);
+			return;
 		}
+		page = pfn_to_page(pfn);
+		if (pte_dirty(pte))
+			set_page_dirty(page);
+		page_remove_rmap(page);
+		page_cache_release(page);
+		dec_mm_counter(mm, file_rss);
 	} else {
 		if (!pte_file(pte))
 			free_swap_and_cache(pte_to_swp_entry(pte));
@@ -65,6 +66,8 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgd_t *pgd;
 	pte_t pte_val;
 
+	BUG_ON(vma->vm_flags & VM_RESERVED);
+
 	pgd = pgd_offset(mm, addr);
 	spin_lock(&mm->page_table_lock);
 	
@@ -125,6 +128,8 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgd_t *pgd;
 	pte_t pte_val;
 
+	BUG_ON(vma->vm_flags & VM_RESERVED);
+
 	pgd = pgd_offset(mm, addr);
 	spin_lock(&mm->page_table_lock);
 	
diff --git a/mm/madvise.c b/mm/madvise.c
index 20e075d1c64c..17aaf3e16449 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 			     unsigned long start, unsigned long end)
 {
 	*prev = vma;
-	if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
+	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED))
 		return -EINVAL;
 
 	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
diff --git a/mm/memory.c b/mm/memory.c
index da642b5528fa..e83f9440bb66 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -342,6 +342,23 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 
 #define NO_RSS 2	/* Increment neither file_rss nor anon_rss */
 
+/*
+ * This function is called to print an error when a pte in a
+ * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * is an error).
+ *
+ * The calling function must still handle the error.
+ */
+void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+{
+	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
+			"vm_flags = %lx, vaddr = %lx\n",
+		(long long)pte_val(pte),
+		(vma->vm_mm == current->mm ? current->comm : "???"),
+		vma->vm_flags, vaddr);
+	dump_stack();
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
@@ -353,9 +370,10 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 
 static inline int
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
+		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
 		unsigned long addr)
 {
+	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
 	unsigned long pfn;
@@ -375,19 +393,23 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		goto out_set_pte;
 	}
 
-	pfn = pte_pfn(pte);
-	/* the pte points outside of valid memory, the
-	 * mapping is assumed to be good, meaningful
-	 * and not mapped via rmap - duplicate the
-	 * mapping as is.
+	/* If the region is VM_RESERVED, the mapping is not
+	 * mapped via rmap - duplicate the pte as is.
 	 */
-	page = NULL;
-	if (pfn_valid(pfn))
-		page = pfn_to_page(pfn);
-
-	if (!page || PageReserved(page))
+	if (vm_flags & VM_RESERVED)
 		goto out_set_pte;
 
+	pfn = pte_pfn(pte);
+	/* If the pte points outside of valid memory but
+	 * the region is not VM_RESERVED, we have a problem.
+	 */
+	if (unlikely(!pfn_valid(pfn))) {
+		print_bad_pte(vma, pte, addr);
+		goto out_set_pte; /* try to do something sane */
+	}
+
+	page = pfn_to_page(pfn);
+
 	/*
 	 * If it's a COW mapping, write protect it both
 	 * in the parent and the child
@@ -418,7 +440,6 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		unsigned long addr, unsigned long end)
 {
 	pte_t *src_pte, *dst_pte;
-	unsigned long vm_flags = vma->vm_flags;
 	int progress = 0;
 	int rss[NO_RSS+1], anon;
 
@@ -446,8 +467,7 @@ again:
 			progress++;
 			continue;
 		}
-		anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
-							vm_flags, addr);
+		anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma,addr);
 		rss[anon]++;
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -541,10 +561,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	return 0;
 }
 
-static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+static void zap_pte_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
+	struct mm_struct *mm = tlb->mm;
 	pte_t *pte;
 	int file_rss = 0;
 	int anon_rss = 0;
@@ -556,11 +578,12 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 			continue;
 		if (pte_present(ptent)) {
 			struct page *page = NULL;
-			unsigned long pfn = pte_pfn(ptent);
-			if (pfn_valid(pfn)) {
-				page = pfn_to_page(pfn);
-				if (PageReserved(page))
-					page = NULL;
+			if (!(vma->vm_flags & VM_RESERVED)) {
+				unsigned long pfn = pte_pfn(ptent);
+				if (unlikely(!pfn_valid(pfn)))
+					print_bad_pte(vma, ptent, addr);
+				else
+					page = pfn_to_page(pfn);
 			}
 			if (unlikely(details) && page) {
 				/*
@@ -580,7 +603,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 				     page->index > details->last_index))
 					continue;
 			}
-			ptent = ptep_get_and_clear_full(tlb->mm, addr, pte,
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
 			tlb_remove_tlb_entry(tlb, pte, addr);
 			if (unlikely(!page))
@@ -588,7 +611,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 			if (unlikely(details) && details->nonlinear_vma
 			    && linear_page_index(details->nonlinear_vma,
 						addr) != page->index)
-				set_pte_at(tlb->mm, addr, pte,
+				set_pte_at(mm, addr, pte,
 					   pgoff_to_pte(page->index));
 			if (PageAnon(page))
 				anon_rss++;
@@ -611,14 +634,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 			continue;
 		if (!pte_file(ptent))
 			free_swap_and_cache(pte_to_swp_entry(ptent));
-		pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
+		pte_clear_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
-	add_mm_rss(tlb->mm, -file_rss, -anon_rss);
+	add_mm_rss(mm, -file_rss, -anon_rss);
 	pte_unmap(pte - 1);
 }
 
-static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+static inline void zap_pmd_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pud_t *pud,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
@@ -630,11 +654,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		zap_pte_range(tlb, pmd, addr, next, details);
+		zap_pte_range(tlb, vma, pmd, addr, next, details);
 	} while (pmd++, addr = next, addr != end);
 }
 
-static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline void zap_pud_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pgd_t *pgd,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
@@ -646,7 +671,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		zap_pmd_range(tlb, pud, addr, next, details);
+		zap_pmd_range(tlb, vma, pud, addr, next, details);
 	} while (pud++, addr = next, addr != end);
 }
 
@@ -667,7 +692,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		zap_pud_range(tlb, pgd, addr, next, details);
+		zap_pud_range(tlb, vma, pgd, addr, next, details);
 	} while (pgd++, addr = next, addr != end);
 	tlb_end_vma(tlb, vma);
 }
@@ -967,7 +992,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			continue;
 		}
 
-		if (!vma || (vma->vm_flags & VM_IO)
+		if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
 				|| !(flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
@@ -1027,8 +1052,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			if (pages) {
 				pages[i] = page;
 				flush_dcache_page(page);
-				if (!PageReserved(page))
-					page_cache_get(page);
+				page_cache_get(page);
 			}
 			if (vmas)
 				vmas[i] = vma;
@@ -1051,7 +1075,11 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	if (!pte)
 		return -ENOMEM;
 	do {
-		pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
+		struct page *page = ZERO_PAGE(addr);
+		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+		page_cache_get(page);
+		page_add_file_rmap(page);
+		inc_mm_counter(mm, file_rss);
 		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, zero_pte);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1132,8 +1160,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		return -ENOMEM;
 	do {
 		BUG_ON(!pte_none(*pte));
-		if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
-			set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap(pte - 1);
@@ -1195,8 +1222,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	 * rest of the world about it:
 	 *   VM_IO tells people not to look at these pages
 	 *	(accesses can have side effects).
-	 *   VM_RESERVED tells swapout not to try to touch
-	 *	this region.
+	 *   VM_RESERVED tells the core MM not to "manage" these pages
+         *	(e.g. refcount, mapcount, try to swap them out).
 	 */
 	vma->vm_flags |= VM_IO | VM_RESERVED;
 
@@ -1256,11 +1283,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t entry;
 	int ret = VM_FAULT_MINOR;
 
+	BUG_ON(vma->vm_flags & VM_RESERVED);
+
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
-		pte_ERROR(orig_pte);
+		print_bad_pte(vma, orig_pte, address);
 		ret = VM_FAULT_OOM;
 		goto unlock;
 	}
@@ -1284,8 +1313,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/*
 	 * Ok, we need to copy. Oh, well..
 	 */
-	if (!PageReserved(old_page))
-		page_cache_get(old_page);
+	page_cache_get(old_page);
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 
@@ -1308,14 +1336,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 	if (likely(pte_same(*page_table, orig_pte))) {
-		if (PageReserved(old_page))
+		page_remove_rmap(old_page);
+		if (!PageAnon(old_page)) {
 			inc_mm_counter(mm, anon_rss);
-		else {
-			page_remove_rmap(old_page);
-			if (!PageAnon(old_page)) {
-				inc_mm_counter(mm, anon_rss);
-				dec_mm_counter(mm, file_rss);
-			}
+			dec_mm_counter(mm, file_rss);
 		}
 		flush_cache_page(vma, address, pfn);
 		entry = mk_pte(new_page, vma->vm_page_prot);
@@ -1769,14 +1793,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		int write_access)
 {
+	struct page *page = ZERO_PAGE(addr);
 	pte_t entry;
 
 	/* Mapping of ZERO_PAGE - vm_page_prot is readonly */
-	entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot);
+	entry = mk_pte(page, vma->vm_page_prot);
 
 	if (write_access) {
-		struct page *page;
-
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
 		spin_unlock(&mm->page_table_lock);
@@ -1800,6 +1823,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		lru_cache_add_active(page);
 		SetPageReferenced(page);
 		page_add_anon_rmap(page, vma, address);
+	} else {
+		inc_mm_counter(mm, file_rss);
+		page_add_file_rmap(page);
+		page_cache_get(page);
 	}
 
 	set_pte_at(mm, address, page_table, entry);
@@ -1916,7 +1943,7 @@ retry:
 			inc_mm_counter(mm, anon_rss);
 			lru_cache_add_active(new_page);
 			page_add_anon_rmap(new_page, vma, address);
-		} else if (!PageReserved(new_page)) {
+		} else if (!(vma->vm_flags & VM_RESERVED)) {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
 		}
@@ -1957,7 +1984,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
-		pte_ERROR(orig_pte);
+		print_bad_pte(vma, orig_pte, address);
 		return VM_FAULT_OOM;
 	}
 	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
@@ -2232,7 +2259,7 @@ static int __init gate_vma_init(void)
 	gate_vma.vm_start = FIXADDR_USER_START;
 	gate_vma.vm_end = FIXADDR_USER_END;
 	gate_vma.vm_page_prot = PAGE_READONLY;
-	gate_vma.vm_flags = 0;
+	gate_vma.vm_flags = VM_RESERVED;
 	return 0;
 }
 __initcall(gate_vma_init);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 43b1199af591..11d824f282f1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -223,13 +223,13 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 }
 
 /* Ensure all existing pages follow the policy. */
-static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
+static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
 
-	spin_lock(&mm->page_table_lock);
+	spin_lock(&vma->vm_mm->page_table_lock);
 	orig_pte = pte = pte_offset_map(pmd, addr);
 	do {
 		unsigned long pfn;
@@ -238,18 +238,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		if (!pte_present(*pte))
 			continue;
 		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn))
+		if (!pfn_valid(pfn)) {
+			print_bad_pte(vma, *pte, addr);
 			continue;
+		}
 		nid = pfn_to_nid(pfn);
 		if (!node_isset(nid, *nodes))
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap(orig_pte);
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(&vma->vm_mm->page_table_lock);
 	return addr != end;
 }
 
-static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
+static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pmd_t *pmd;
@@ -260,13 +262,13 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(mm, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
+static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pud_t *pud;
@@ -277,24 +279,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(mm, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int check_pgd_range(struct mm_struct *mm,
+static inline int check_pgd_range(struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pgd_t *pgd;
 	unsigned long next;
 
-	pgd = pgd_offset(mm, addr);
+	pgd = pgd_offset(vma->vm_mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(mm, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
@@ -311,6 +313,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
+	if (first->vm_flags & VM_RESERVED)
+		return ERR_PTR(-EACCES);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 		if (!vma->vm_next && vma->vm_end < end)
@@ -323,8 +327,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
-			err = check_pgd_range(vma->vm_mm,
-					   start, endvma, nodes);
+			err = check_pgd_range(vma, start, endvma, nodes);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
diff --git a/mm/mmap.c b/mm/mmap.c
index 459b9f068ad7..8a111792b8db 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1088,6 +1088,17 @@ munmap_back:
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
+		if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
+						== (VM_WRITE | VM_RESERVED)) {
+			printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+				"PROT_WRITE mmap of VM_RESERVED memory, which "
+				"is deprecated. Please report this to "
+				"linux-kernel@vger.kernel.org\n",current->comm);
+			if (vma->vm_ops && vma->vm_ops->close)
+				vma->vm_ops->close(vma);
+			error = -EACCES;
+			goto unmap_and_free_vma;
+		}
 	} else if (vm_flags & VM_SHARED) {
 		error = shmem_zero_setup(vma);
 		if (error)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b426f01c5e9c..672a76fddd5e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -125,6 +125,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	 * a MAP_NORESERVE private mapping to writable will now reserve.
 	 */
 	if (newflags & VM_WRITE) {
+		if (oldflags & VM_RESERVED) {
+			BUG_ON(oldflags & VM_WRITE);
+			printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+				"PROT_WRITE mprotect of VM_RESERVED memory, "
+				"which is deprecated. Please report this to "
+				"linux-kernel@vger.kernel.org\n",current->comm);
+			return -EACCES;
+		}
 		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
 			charged = nrpages;
 			if (security_vm_enough_memory(charged))
diff --git a/mm/msync.c b/mm/msync.c
index 3b5f1c521d4b..860395486060 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -25,6 +25,7 @@
 static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte;
 	int progress = 0;
 
@@ -37,7 +38,7 @@ again:
 		if (progress >= 64) {
 			progress = 0;
 			if (need_resched() ||
-			    need_lockbreak(&vma->vm_mm->page_table_lock))
+			    need_lockbreak(&mm->page_table_lock))
 				break;
 		}
 		progress++;
@@ -46,11 +47,11 @@ again:
 		if (!pte_maybe_dirty(*pte))
 			continue;
 		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn))
+		if (unlikely(!pfn_valid(pfn))) {
+			print_bad_pte(vma, *pte, addr);
 			continue;
+		}
 		page = pfn_to_page(pfn);
-		if (PageReserved(page))
-			continue;
 
 		if (ptep_clear_flush_dirty(vma, addr, pte) ||
 		    page_test_and_clear_dirty(page))
@@ -58,7 +59,7 @@ again:
 		progress += 3;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap(pte - 1);
-	cond_resched_lock(&vma->vm_mm->page_table_lock);
+	cond_resched_lock(&mm->page_table_lock);
 	if (addr != end)
 		goto again;
 }
@@ -102,8 +103,10 @@ static void msync_page_range(struct vm_area_struct *vma,
 
 	/* For hugepages we can't go walking the page table normally,
 	 * but that's ok, hugetlbfs is memory based, so we don't need
-	 * to do anything more on an msync() */
-	if (is_vm_hugetlb_page(vma))
+	 * to do anything more on an msync().
+	 * Can't do anything with VM_RESERVED regions either.
+	 */
+	if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED))
 		return;
 
 	BUG_ON(addr >= end);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 60663232fbb2..0541288ebf4b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -114,7 +114,8 @@ static void bad_page(const char *function, struct page *page)
 			1 << PG_reclaim |
 			1 << PG_slab    |
 			1 << PG_swapcache |
-			1 << PG_writeback);
+			1 << PG_writeback |
+			1 << PG_reserved );
 	set_page_count(page, 0);
 	reset_page_mapcount(page);
 	page->mapping = NULL;
@@ -244,7 +245,6 @@ static inline int page_is_buddy(struct page *page, int order)
 {
        if (PagePrivate(page)           &&
            (page_order(page) == order) &&
-           !PageReserved(page)         &&
             page_count(page) == 0)
                return 1;
        return 0;
@@ -327,7 +327,8 @@ static inline void free_pages_check(const char *function, struct page *page)
 			1 << PG_reclaim	|
 			1 << PG_slab	|
 			1 << PG_swapcache |
-			1 << PG_writeback )))
+			1 << PG_writeback |
+			1 << PG_reserved )))
 		bad_page(function, page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
@@ -455,7 +456,8 @@ static void prep_new_page(struct page *page, int order)
 			1 << PG_reclaim	|
 			1 << PG_slab    |
 			1 << PG_swapcache |
-			1 << PG_writeback )))
+			1 << PG_writeback |
+			1 << PG_reserved )))
 		bad_page(__FUNCTION__, page);
 
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
@@ -1016,7 +1018,7 @@ void __pagevec_free(struct pagevec *pvec)
 
 fastcall void __free_pages(struct page *page, unsigned int order)
 {
-	if (!PageReserved(page) && put_page_testzero(page)) {
+	if (put_page_testzero(page)) {
 		if (order == 0)
 			free_hot_page(page);
 		else
@@ -1674,7 +1676,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 			continue;
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
-		set_page_count(page, 0);
+		set_page_count(page, 1);
 		reset_page_mapcount(page);
 		SetPageReserved(page);
 		INIT_LIST_HEAD(&page->lru);
diff --git a/mm/rmap.c b/mm/rmap.c
index 504757624cce..f69d5342ce7f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -443,8 +443,6 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
 {
-	BUG_ON(PageReserved(page));
-
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		struct anon_vma *anon_vma = vma->anon_vma;
 
@@ -468,8 +466,7 @@ void page_add_anon_rmap(struct page *page,
 void page_add_file_rmap(struct page *page)
 {
 	BUG_ON(PageAnon(page));
-	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
-		return;
+	BUG_ON(!pfn_valid(page_to_pfn(page)));
 
 	if (atomic_inc_and_test(&page->_mapcount))
 		inc_page_state(nr_mapped);
@@ -483,8 +480,6 @@ void page_add_file_rmap(struct page *page)
  */
 void page_remove_rmap(struct page *page)
 {
-	BUG_ON(PageReserved(page));
-
 	if (atomic_add_negative(-1, &page->_mapcount)) {
 		BUG_ON(page_mapcount(page) < 0);
 		/*
@@ -640,13 +635,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
 			continue;
 
 		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn))
+		if (unlikely(!pfn_valid(pfn))) {
+			print_bad_pte(vma, *pte, address);
 			continue;
+		}
 
 		page = pfn_to_page(pfn);
 		BUG_ON(PageAnon(page));
-		if (PageReserved(page))
-			continue;
 
 		if (ptep_clear_flush_young(vma, address, pte))
 			continue;
@@ -808,7 +803,6 @@ int try_to_unmap(struct page *page)
 {
 	int ret;
 
-	BUG_ON(PageReserved(page));
 	BUG_ON(!PageLocked(page));
 
 	if (PageAnon(page))
diff --git a/mm/shmem.c b/mm/shmem.c
index 6796311a23ef..37777f4c11f8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1506,8 +1506,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 			 */
 			if (!offset)
 				mark_page_accessed(page);
-		} else
+		} else {
 			page = ZERO_PAGE(0);
+			page_cache_get(page);
+		}
 
 		/*
 		 * Ok, we have the page, and it's up-to-date, so
diff --git a/mm/swap.c b/mm/swap.c
index 7771d2803f62..21d15f99805c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -48,7 +48,7 @@ void put_page(struct page *page)
 		}
 		return;
 	}
-	if (!PageReserved(page) && put_page_testzero(page))
+	if (put_page_testzero(page))
 		__page_cache_release(page);
 }
 EXPORT_SYMBOL(put_page);
@@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold)
 		struct page *page = pages[i];
 		struct zone *pagezone;
 
-		if (PageReserved(page) || !put_page_testzero(page))
+		if (!put_page_testzero(page))
 			continue;
 
 		pagezone = page_zone(page);
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 67abebabf83e..e97b2d162cc7 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2949,8 +2949,7 @@ static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, uns
 		return NOPAGE_OOM;
 	runtime = substream->runtime;
 	page = virt_to_page(runtime->status);
-	if (!PageReserved(page))
-		get_page(page);
+	get_page(page);
 	if (type)
 		*type = VM_FAULT_MINOR;
 	return page;
@@ -2992,8 +2991,7 @@ static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, un
 		return NOPAGE_OOM;
 	runtime = substream->runtime;
 	page = virt_to_page(runtime->control);
-	if (!PageReserved(page))
-		get_page(page);
+	get_page(page);
 	if (type)
 		*type = VM_FAULT_MINOR;
 	return page;
@@ -3066,8 +3064,7 @@ static struct page *snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsign
 		vaddr = runtime->dma_area + offset;
 		page = virt_to_page(vaddr);
 	}
-	if (!PageReserved(page))
-		get_page(page);
+	get_page(page);
 	if (type)
 		*type = VM_FAULT_MINOR;
 	return page;

From 8c10376271e097fa13cda956e1b2f3cb7e4d4dd9 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:13 -0700
Subject: [PATCH 39/91] [PATCH] mm: copy_one_pte inc rss

Small adjustment, following Nick's suggestion: it's more straightforward for
copy_pte_range to let copy_one_pte do the rss incrementation than to use an
index it passes back.  Saves a #define, and 16 bytes of .text.
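
For illustration only (not part of this patch): the two-slot rss[] array is
indexed by a boolean predicate, so the caller can apply both totals in one go
once the loop ends.  A minimal standalone sketch of the same bucket-counting
pattern, with is_anon() as a hypothetical stand-in for PageAnon():

    #include <stdio.h>

    static int is_anon(int page)            /* stand-in for PageAnon() */
    {
            return page % 2;                /* hypothetical predicate */
    }

    int main(void)
    {
            int rss[2] = { 0, 0 };          /* rss[0] file, rss[1] anon */
            int page;

            for (page = 0; page < 10; page++)
                    rss[!!is_anon(page)]++;

            printf("file_rss += %d, anon_rss += %d\n", rss[0], rss[1]);
            return 0;
    }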

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index e83f9440bb66..7893eb4bb8c0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -340,8 +340,6 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 		add_mm_counter(mm, anon_rss, anon_rss);
 }
 
-#define NO_RSS 2	/* Increment neither file_rss nor anon_rss */
-
 /*
  * This function is called to print an error when a pte in a
  * !VM_RESERVED region is found pointing to an invalid pfn (which
@@ -368,16 +366,15 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
  * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
  */
 
-static inline int
+static inline void
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
-		unsigned long addr)
+		unsigned long addr, int *rss)
 {
 	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
 	unsigned long pfn;
-	int anon = NO_RSS;
 
 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
@@ -428,11 +425,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pte = pte_mkold(pte);
 	get_page(page);
 	page_dup_rmap(page);
-	anon = !!PageAnon(page);
+	rss[!!PageAnon(page)]++;
 
 out_set_pte:
 	set_pte_at(dst_mm, addr, dst_pte, pte);
-	return anon;
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -441,7 +437,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 {
 	pte_t *src_pte, *dst_pte;
 	int progress = 0;
-	int rss[NO_RSS+1], anon;
+	int rss[2];
 
 again:
 	rss[1] = rss[0] = 0;
@@ -467,8 +463,7 @@ again:
 			progress++;
 			continue;
 		}
-		anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma,addr);
-		rss[anon]++;
+		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 	spin_unlock(&src_mm->page_table_lock);

From 86d912f41dca32eca8827f2f878139735e69dc28 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:14 -0700
Subject: [PATCH 40/91] [PATCH] mm: zap_pte_range dec rss

Small adjustment: zap_pte_range decrements its rss counts down from 0 and then
finally adds them, avoiding negations - we don't have or need a sub_mm_rss.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 7893eb4bb8c0..bc6296398f8b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -609,13 +609,13 @@ static void zap_pte_range(struct mmu_gather *tlb,
 				set_pte_at(mm, addr, pte,
 					   pgoff_to_pte(page->index));
 			if (PageAnon(page))
-				anon_rss++;
+				anon_rss--;
 			else {
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
 				if (pte_young(ptent))
 					mark_page_accessed(page);
-				file_rss++;
+				file_rss--;
 			}
 			page_remove_rmap(page);
 			tlb_remove_page(tlb, page);
@@ -632,7 +632,7 @@ static void zap_pte_range(struct mmu_gather *tlb,
 		pte_clear_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
-	add_mm_rss(mm, -file_rss, -anon_rss);
+	add_mm_rss(mm, file_rss, anon_rss);
 	pte_unmap(pte - 1);
 }
 

From 9e9bef07ce5a342aa6246ebc5c20829d0d5d63d0 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:15 -0700
Subject: [PATCH 41/91] [PATCH] mm: do_swap_page race major

Small adjustment: do_swap_page should report its !pte_same race as a major
fault if it had to read into swap cache, because whatever raced with it will
have found the page already in cache and reported a minor fault.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index bc6296398f8b..a25ee1d3e20a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1728,10 +1728,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
-	if (unlikely(!pte_same(*page_table, orig_pte))) {
-		ret = VM_FAULT_MINOR;
+	if (unlikely(!pte_same(*page_table, orig_pte)))
 		goto out_nomap;
-	}
 
 	if (unlikely(!PageUptodate(page))) {
 		ret = VM_FAULT_SIGBUS;

From d0de32d9b71e11cc51618c2045086e9694093d01 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:16 -0700
Subject: [PATCH 42/91] [PATCH] mm: do_mremap current mm

Cleanup: relieve do_mremap from its surfeit of current->mms.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mremap.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index f4e562098500..318eea5467a0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -245,6 +245,7 @@ unsigned long do_mremap(unsigned long addr,
 	unsigned long old_len, unsigned long new_len,
 	unsigned long flags, unsigned long new_addr)
 {
+	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
@@ -285,7 +286,7 @@ unsigned long do_mremap(unsigned long addr,
 		if ((addr <= new_addr) && (addr+old_len) > new_addr)
 			goto out;
 
-		ret = do_munmap(current->mm, new_addr, new_len);
+		ret = do_munmap(mm, new_addr, new_len);
 		if (ret)
 			goto out;
 	}
@@ -296,7 +297,7 @@ unsigned long do_mremap(unsigned long addr,
 	 * do_munmap does all the needed commit accounting
 	 */
 	if (old_len >= new_len) {
-		ret = do_munmap(current->mm, addr+new_len, old_len - new_len);
+		ret = do_munmap(mm, addr+new_len, old_len - new_len);
 		if (ret && old_len != new_len)
 			goto out;
 		ret = addr;
@@ -309,7 +310,7 @@ unsigned long do_mremap(unsigned long addr,
 	 * Ok, we need to grow..  or relocate.
 	 */
 	ret = -EFAULT;
-	vma = find_vma(current->mm, addr);
+	vma = find_vma(mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
 	if (is_vm_hugetlb_page(vma)) {
@@ -325,14 +326,14 @@ unsigned long do_mremap(unsigned long addr,
 	}
 	if (vma->vm_flags & VM_LOCKED) {
 		unsigned long locked, lock_limit;
-		locked = current->mm->locked_vm << PAGE_SHIFT;
+		locked = mm->locked_vm << PAGE_SHIFT;
 		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 		locked += new_len - old_len;
 		ret = -EAGAIN;
 		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
 			goto out;
 	}
-	if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) {
+	if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -359,11 +360,10 @@ unsigned long do_mremap(unsigned long addr,
 			vma_adjust(vma, vma->vm_start,
 				addr + new_len, vma->vm_pgoff, NULL);
 
-			current->mm->total_vm += pages;
-			vm_stat_account(vma->vm_mm, vma->vm_flags,
-							vma->vm_file, pages);
+			mm->total_vm += pages;
+			vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
 			if (vma->vm_flags & VM_LOCKED) {
-				current->mm->locked_vm += pages;
+				mm->locked_vm += pages;
 				make_pages_present(addr + old_len,
 						   addr + new_len);
 			}

From 861f2fb8e796022b4928cab9c74fca6681a1c557 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:17 -0700
Subject: [PATCH 43/91] [PATCH] mm: zap_pte out of line

There used to be just one call to zap_pte, but it shouldn't be inline now that
there are two.  Check for the common case pte_none before calling, and move
its rss accounting up into install_page or install_file_pte - which helps the
next patch.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/fremap.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/mm/fremap.c b/mm/fremap.c
index 224cc1598b35..7f08d10ceaff 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -20,34 +20,32 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, pte_t *ptep)
 {
 	pte_t pte = *ptep;
+	struct page *page = NULL;
 
-	if (pte_none(pte))
-		return;
 	if (pte_present(pte)) {
 		unsigned long pfn = pte_pfn(pte);
-		struct page *page;
-
 		flush_cache_page(vma, addr, pfn);
 		pte = ptep_clear_flush(vma, addr, ptep);
 		if (unlikely(!pfn_valid(pfn))) {
 			print_bad_pte(vma, pte, addr);
-			return;
+			goto out;
 		}
 		page = pfn_to_page(pfn);
 		if (pte_dirty(pte))
 			set_page_dirty(page);
 		page_remove_rmap(page);
 		page_cache_release(page);
-		dec_mm_counter(mm, file_rss);
 	} else {
 		if (!pte_file(pte))
 			free_swap_and_cache(pte_to_swp_entry(pte));
 		pte_clear(mm, addr, ptep);
 	}
+out:
+	return !!page;
 }
 
 /*
@@ -96,9 +94,9 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (page_mapcount(page) > INT_MAX/2)
 		goto err_unlock;
 
-	zap_pte(mm, vma, addr, pte);
+	if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
+		inc_mm_counter(mm, file_rss);
 
-	inc_mm_counter(mm, file_rss);
 	flush_icache_page(vma, page);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 	page_add_file_rmap(page);
@@ -145,7 +143,8 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte)
 		goto err_unlock;
 
-	zap_pte(mm, vma, addr, pte);
+	if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte))
+		dec_mm_counter(mm, file_rss);
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
 	pte_val = *pte;

From 365e9c87a982c03d0af3886e29d877f581b59611 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:18 -0700
Subject: [PATCH 44/91] [PATCH] mm: update_hiwaters just in time

update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability.  Originally it was called whenever rss or
total_vm got raised.  Then many of those callsites were replaced by a timer
tick call from account_system_time.  Now Frank van Maarseveen reports that to
be found inadequate.  How about this?  Works for Frank.

Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm.  Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths.  Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit.  Handle
mm->hiwater_vm in the same way, though it's much less of an issue.  Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.

And there has been no collector of these hiwater statistics in the tree.  The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).

There was a particular anomaly during mremap move: hiwater_vm might be
captured too high.  A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.

What locking?  None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact.  But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
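
As a hedged usage sketch (not part of this patch): a collector of the new
VmPeak and VmHWM lines added to /proc/<pid>/status below might look like the
following; only the field names come from this patch, everything else here is
an assumption for illustration:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/proc/self/status", "r");

            if (!f)
                    return 1;
            while (fgets(line, sizeof(line), f)) {
                    /* print only the high-water marks this patch exports */
                    if (!strncmp(line, "VmPeak:", 7) ||
                        !strncmp(line, "VmHWM:", 6))
                            fputs(line, stdout);
            }
            fclose(f);
            return 0;
    }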

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c           |  1 -
 fs/exec.c             |  1 -
 fs/proc/task_mmu.c    | 23 +++++++++++++++++++++--
 include/linux/mm.h    |  3 ---
 include/linux/sched.h | 10 ++++++++++
 kernel/exit.c         |  5 ++++-
 kernel/sched.c        |  2 --
 mm/fremap.c           |  4 +++-
 mm/hugetlb.c          |  3 +++
 mm/memory.c           | 17 +----------------
 mm/mmap.c             |  4 ++++
 mm/mremap.c           | 12 ++++++++++--
 mm/nommu.c            | 15 ++-------------
 mm/rmap.c             |  6 ++++++
 14 files changed, 64 insertions(+), 42 deletions(-)

diff --git a/fs/compat.c b/fs/compat.c
index a719e158e002..8e71cdbecc7c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename,
 		/* execve success */
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
-		update_mem_hiwater(current);
 		kfree(bprm);
 		return retval;
 	}
diff --git a/fs/exec.c b/fs/exec.c
index cefadf5ab83b..9bb55c8cf224 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1207,7 +1207,6 @@ int do_execve(char * filename,
 		/* execve success */
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
-		update_mem_hiwater(current);
 		kfree(bprm);
 		return retval;
 	}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index bccee7cf9ccd..7c89b4549049 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -14,22 +14,41 @@
 char *task_mem(struct mm_struct *mm, char *buffer)
 {
 	unsigned long data, text, lib;
+	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+
+	/*
+	 * Note: to minimize their overhead, mm maintains hiwater_vm and
+	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
+	 * collector of these hiwater stats must therefore get total_vm
+	 * and rss too, which will usually be the higher.  Barriers? not
+	 * worth the effort, such snapshots can always be inconsistent.
+	 */
+	hiwater_vm = total_vm = mm->total_vm;
+	if (hiwater_vm < mm->hiwater_vm)
+		hiwater_vm = mm->hiwater_vm;
+	hiwater_rss = total_rss = get_mm_rss(mm);
+	if (hiwater_rss < mm->hiwater_rss)
+		hiwater_rss = mm->hiwater_rss;
 
 	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	buffer += sprintf(buffer,
+		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
 		"VmLck:\t%8lu kB\n"
+		"VmHWM:\t%8lu kB\n"
 		"VmRSS:\t%8lu kB\n"
 		"VmData:\t%8lu kB\n"
 		"VmStk:\t%8lu kB\n"
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n",
-		(mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
+		hiwater_vm << (PAGE_SHIFT-10),
+		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
 		mm->locked_vm << (PAGE_SHIFT-10),
-		get_mm_rss(mm) << (PAGE_SHIFT-10),
+		hiwater_rss << (PAGE_SHIFT-10),
+		total_rss << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
 		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index da42093250c3..7d4552fe0864 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -938,9 +938,6 @@ static inline void vm_stat_account(struct mm_struct *mm,
 }
 #endif /* CONFIG_PROC_FS */
 
-/* update per process rss and vm hiwater data */
-extern void update_mem_hiwater(struct task_struct *tsk);
-
 #ifndef CONFIG_DEBUG_PAGEALLOC
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index afcaac66cbd5..a9c0b7d26303 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -256,6 +256,16 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 #define dec_mm_counter(mm, member) (mm)->_##member--
 #define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
 
+#define update_hiwater_rss(mm)	do {			\
+	unsigned long _rss = get_mm_rss(mm);		\
+	if ((mm)->hiwater_rss < _rss)			\
+		(mm)->hiwater_rss = _rss;		\
+} while (0)
+#define update_hiwater_vm(mm)	do {			\
+	if ((mm)->hiwater_vm < (mm)->total_vm)		\
+		(mm)->hiwater_vm = (mm)->total_vm;	\
+} while (0)
+
 typedef unsigned long mm_counter_t;
 
 struct mm_struct {
diff --git a/kernel/exit.c b/kernel/exit.c
index 3b25b182d2be..79f52b85d6ed 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -839,7 +839,10 @@ fastcall NORET_TYPE void do_exit(long code)
 				preempt_count());
 
 	acct_update_integrals(tsk);
-	update_mem_hiwater(tsk);
+	if (tsk->mm) {
+		update_hiwater_rss(tsk->mm);
+		update_hiwater_vm(tsk->mm);
+	}
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
  		del_timer_sync(&tsk->signal->real_timer);
diff --git a/kernel/sched.c b/kernel/sched.c
index 1e5cafdf4e27..4f26c544d02c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 		cpustat->idle = cputime64_add(cpustat->idle, tmp);
 	/* Account for system time used */
 	acct_update_integrals(p);
-	/* Update rss highwater mark */
-	update_mem_hiwater(p);
 }
 
 /*
diff --git a/mm/fremap.c b/mm/fremap.c
index 7f08d10ceaff..49719a35769a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -143,8 +143,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte)
 		goto err_unlock;
 
-	if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte))
+	if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
+		update_hiwater_rss(mm);
 		dec_mm_counter(mm, file_rss);
+	}
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
 	pte_val = *pte;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 094455bcbbf7..ac5f044bf514 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -310,6 +310,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
+	/* Update high watermark before we lower rss */
+	update_hiwater_rss(mm);
+
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (! ptep)
diff --git a/mm/memory.c b/mm/memory.c
index a25ee1d3e20a..692ad810263d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -820,6 +820,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 	lru_add_drain();
 	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
+	update_hiwater_rss(mm);
 	end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
 	tlb_finish_mmu(tlb, address, end);
 	spin_unlock(&mm->page_table_lock);
@@ -2225,22 +2226,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr)
 
 EXPORT_SYMBOL(vmalloc_to_pfn);
 
-/*
- * update_mem_hiwater
- *	- update per process rss and vm high water data
- */
-void update_mem_hiwater(struct task_struct *tsk)
-{
-	if (tsk->mm) {
-		unsigned long rss = get_mm_rss(tsk->mm);
-
-		if (tsk->mm->hiwater_rss < rss)
-			tsk->mm->hiwater_rss = rss;
-		if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
-			tsk->mm->hiwater_vm = tsk->mm->total_vm;
-	}
-}
-
 #if !defined(__HAVE_ARCH_GATE_AREA)
 
 #if defined(AT_SYSINFO_EHDR)
diff --git a/mm/mmap.c b/mm/mmap.c
index 8a111792b8db..c43b28457007 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1640,6 +1640,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
  */
 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 {
+	/* Update high watermark before we lower total_vm */
+	update_hiwater_vm(mm);
 	do {
 		long nrpages = vma_pages(vma);
 
@@ -1668,6 +1670,7 @@ static void unmap_region(struct mm_struct *mm,
 	lru_add_drain();
 	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
+	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
@@ -1953,6 +1956,7 @@ void exit_mmap(struct mm_struct *mm)
 
 	flush_cache_mm(mm);
 	tlb = tlb_gather_mmu(mm, 1);
+	/* Don't update_hiwater_rss(mm) here, do_exit already did */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
diff --git a/mm/mremap.c b/mm/mremap.c
index 318eea5467a0..ccf456477020 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -167,6 +167,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	unsigned long new_pgoff;
 	unsigned long moved_len;
 	unsigned long excess = 0;
+	unsigned long hiwater_vm;
 	int split = 0;
 
 	/*
@@ -205,9 +206,15 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	}
 
 	/*
-	 * if we failed to move page tables we still do total_vm increment
-	 * since do_munmap() will decrement it by old_len == new_len
+	 * If we failed to move page tables we still do total_vm increment
+	 * since do_munmap() will decrement it by old_len == new_len.
+	 *
+	 * Since total_vm is about to be raised artificially high for a
+	 * moment, we need to restore high watermark afterwards: if stats
+	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
+	 * If this were a serious issue, we'd add a flag to do_munmap().
 	 */
+	hiwater_vm = mm->hiwater_vm;
 	mm->total_vm += new_len >> PAGE_SHIFT;
 	vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 
@@ -216,6 +223,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		vm_unacct_memory(excess >> PAGE_SHIFT);
 		excess = 0;
 	}
+	mm->hiwater_vm = hiwater_vm;
 
 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
 	if (excess) {
diff --git a/mm/nommu.c b/mm/nommu.c
index 599924886eb5..dfb124ffb9be 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
 	realalloc -= kobjsize(vml);
 	askedalloc -= sizeof(*vml);
 	kfree(vml);
+
+	update_hiwater_vm(mm);
 	mm->total_vm -= len >> PAGE_SHIFT;
 
 #ifdef DEBUG
@@ -1078,19 +1080,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
 {
 }
 
-void update_mem_hiwater(struct task_struct *tsk)
-{
-	unsigned long rss;
-
-	if (likely(tsk->mm)) {
-		rss = get_mm_rss(tsk->mm);
-		if (tsk->mm->hiwater_rss < rss)
-			tsk->mm->hiwater_rss = rss;
-		if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
-			tsk->mm->hiwater_vm = tsk->mm->total_vm;
-	}
-}
-
 void unmap_mapping_range(struct address_space *mapping,
 			 loff_t const holebegin, loff_t const holelen,
 			 int even_cows)
diff --git a/mm/rmap.c b/mm/rmap.c
index f69d5342ce7f..4c52c56c9905 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -538,6 +538,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 	if (pte_dirty(pteval))
 		set_page_dirty(page);
 
+	/* Update high watermark before we lower rss */
+	update_hiwater_rss(mm);
+
 	if (PageAnon(page)) {
 		swp_entry_t entry = { .val = page->private };
 		/*
@@ -628,6 +631,9 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	if (!pmd_present(*pmd))
 		goto out_unlock;
 
+	/* Update high watermark before we lower rss */
+	update_hiwater_rss(mm);
+
 	for (original_pte = pte = pte_offset_map(pmd, address);
 			address < end; pte++, address += PAGE_SIZE) {
 

From f449952bc8bde7fbc73c6d20dff92b627a21f8b9 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:19 -0700
Subject: [PATCH 45/91] [PATCH] mm: mm_struct hiwaters moved

Slight and timid rearrangement of mm_struct: hiwater_rss and hiwater_vm were
tacked on the end, but it seems better to keep them near _file_rss, _anon_rss
and total_vm, in the same cacheline on those arches verified.

There are likely to be more profitable rearrangements, but less obvious (is it
good or bad that saved_auxv[AT_VECTOR_SIZE] isolates cpu_vm_mask and context
from many others?), needing serious instrumentation.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a9c0b7d26303..292cb57ce38f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -291,16 +291,19 @@ struct mm_struct {
 						 * by mmlist_lock
 						 */
 
-	unsigned long start_code, end_code, start_data, end_data;
-	unsigned long start_brk, brk, start_stack;
-	unsigned long arg_start, arg_end, env_start, env_end;
-	unsigned long total_vm, locked_vm, shared_vm;
-	unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;
-
 	/* Special counters protected by the page_table_lock */
 	mm_counter_t _file_rss;
 	mm_counter_t _anon_rss;
 
+	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
+	unsigned long hiwater_vm;	/* High-water virtual memory usage */
+
+	unsigned long total_vm, locked_vm, shared_vm, exec_vm;
+	unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
+	unsigned long start_code, end_code, start_data, end_data;
+	unsigned long start_brk, brk, start_stack;
+	unsigned long arg_start, arg_end, env_start, env_end;
+
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
 	unsigned dumpable:2;
@@ -320,11 +323,7 @@ struct mm_struct {
 	/* aio bits */
 	rwlock_t		ioctx_list_lock;
 	struct kioctx		*ioctx_list;
-
 	struct kioctx		default_kioctx;
-
-	unsigned long hiwater_rss;	/* High-water RSS usage */
-	unsigned long hiwater_vm;	/* High-water virtual memory usage */
 };
 
 struct sighand_struct {

From 46dea3d092d23a58b42499cc8a21de0fad079f4a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:20 -0700
Subject: [PATCH 46/91] [PATCH] mm: ia64 use expand_upwards

ia64 has an expand_backing_store function for growing its Register Backing Store
vma upwards.  But more complete code for this purpose is found in the
CONFIG_STACK_GROWSUP part of mm/mmap.c.  Uglify its #ifdefs further to provide
expand_upwards for ia64 as well as expand_stack for parisc.

The Register Backing Store vma should be marked VM_ACCOUNT.  Implement the
intention of growing it only a page at a time, instead of passing an address
outside of the vma to handle_mm_fault, with unknown consequences.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/mm/fault.c | 34 +++++++---------------------------
 arch/ia64/mm/init.c  |  2 +-
 include/linux/mm.h   |  3 ++-
 mm/mmap.c            | 17 ++++++++++++++---
 4 files changed, 24 insertions(+), 32 deletions(-)

diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index f21b55549787..af7eb087dca7 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -19,32 +19,6 @@
 
 extern void die (char *, struct pt_regs *, long);
 
-/*
- * This routine is analogous to expand_stack() but instead grows the
- * register backing store (which grows towards higher addresses).
- * Since the register backing store is access sequentially, we
- * disallow growing the RBS by more than a page at a time.  Note that
- * the VM_GROWSUP flag can be set on any VM area but that's fine
- * because the total process size is still limited by RLIMIT_STACK and
- * RLIMIT_AS.
- */
-static inline long
-expand_backing_store (struct vm_area_struct *vma, unsigned long address)
-{
-	unsigned long grow;
-
-	grow = PAGE_SIZE >> PAGE_SHIFT;
-	if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur
-	    || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur))
-		return -ENOMEM;
-	vma->vm_end += PAGE_SIZE;
-	vma->vm_mm->total_vm += grow;
-	if (vma->vm_flags & VM_LOCKED)
-		vma->vm_mm->locked_vm += grow;
-	vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow);
-	return 0;
-}
-
 /*
  * Return TRUE if ADDRESS points at a page in the kernel's mapped segment
  * (inside region 5, on ia64) and that page is present.
@@ -185,7 +159,13 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 		if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
 		    || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
 			goto bad_area;
-		if (expand_backing_store(vma, address))
+		/*
+		 * Since the register backing store is accessed sequentially,
+		 * we disallow growing it by more than a page at a time.
+		 */
+		if (address > vma->vm_end + PAGE_SIZE - sizeof(long))
+			goto bad_area;
+		if (expand_upwards(vma, address))
 			goto bad_area;
 	}
 	goto good_area;
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 98246acd4991..0063b2c50908 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -158,7 +158,7 @@ ia64_init_addr_space (void)
 		vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
 		vma->vm_end = vma->vm_start + PAGE_SIZE;
 		vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7];
-		vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP;
+		vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT;
 		down_write(&current->mm->mmap_sem);
 		if (insert_vm_struct(current->mm, vma)) {
 			up_write(&current->mm->mmap_sem);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7d4552fe0864..89398032bc4b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -896,7 +896,8 @@ void handle_ra_miss(struct address_space *mapping,
 unsigned long max_sane_readahead(unsigned long nr);
 
 /* Do stack extension */
-extern int expand_stack(struct vm_area_struct * vma, unsigned long address);
+extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
+extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
diff --git a/mm/mmap.c b/mm/mmap.c
index c43b28457007..d931d7e49ac9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1508,11 +1508,15 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
 	return 0;
 }
 
-#ifdef CONFIG_STACK_GROWSUP
+#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
 /*
- * vma is the first one with address > vma->vm_end.  Have to extend vma.
+ * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * vma is the last one with address > vma->vm_end.  Have to extend vma.
  */
-int expand_stack(struct vm_area_struct * vma, unsigned long address)
+#ifdef CONFIG_STACK_GROWSUP
+static inline
+#endif
+int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 {
 	int error;
 
@@ -1550,6 +1554,13 @@ int expand_stack(struct vm_area_struct * vma, unsigned long address)
 	anon_vma_unlock(vma);
 	return error;
 }
+#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
+
+#ifdef CONFIG_STACK_GROWSUP
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+	return expand_upwards(vma, address);
+}
 
 struct vm_area_struct *
 find_extend_vma(struct mm_struct *mm, unsigned long addr)

From 872fec16d9a0ed3b75b8893aa217e49cca575ee5 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:21 -0700
Subject: [PATCH 47/91] [PATCH] mm: init_mm without ptlock

First step in pushing down the page_table_lock.  init_mm.page_table_lock has
been used throughout the architectures (usually for ioremap): not to serialize
kernel address space allocation (that's usually vmlist_lock), but because
pud_alloc, pmd_alloc and pte_alloc_kernel expect the caller to hold it.

Reverse that: don't lock or unlock init_mm.page_table_lock in any of the
architectures; instead rely on pud_alloc, pmd_alloc and pte_alloc_kernel to
take and drop it when allocating a new table, so they can check whether a
racing task has already populated the entry.  Similarly, no page_table_lock
is taken in vmalloc's map_vm_area.

Some temporary ugliness in __pud_alloc and __pmd_alloc: since they also handle
user mms, which are converted only by a later patch, for now they have to lock
differently according to whether or not it's init_mm.

If sources get muddled, there's a danger that an arch source taking
init_mm.page_table_lock will be mixed with common source also taking it (or
with neither taking it).  So break the rules and make another change, which
should break the build for such a mismatch: remove the redundant mm arg from
pte_alloc_kernel (ppc64 scrapped its distinct ioremap_mm in 2.6.13).

Exceptions: arm26 used pte_alloc_kernel on user mm, now pte_alloc_map; ia64
used pte_alloc_map on init_mm, now pte_alloc_kernel; parisc had bad args to
pmd_alloc and pte_alloc_kernel in unused USE_HPPA_IOREMAP code; ppc64
map_io_page forgot to unlock on failure; ppc mmu_mapin_ram and ppc64 im_free
took page_table_lock for no good reason.
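
As a hedged sketch of the new convention (not code added by this patch), a
kernel mapping now looks like the following; remap_one_kernel_page() is an
illustrative helper, the real callers being the arch ioremap/consistent/dma
routines converted below.

    static int remap_one_kernel_page(unsigned long addr, unsigned long pfn,
                                     pgprot_t prot)
    {
            pgd_t *pgd = pgd_offset_k(addr);
            pud_t *pud = pud_alloc(&init_mm, pgd, addr);
            pmd_t *pmd;
            pte_t *pte;

            if (!pud)
                    return -ENOMEM;
            pmd = pmd_alloc(&init_mm, pud, addr);
            if (!pmd)
                    return -ENOMEM;
            /* no mm argument, and no init_mm.page_table_lock taken here:
             * the allocators take and drop it themselves when needed */
            pte = pte_alloc_kernel(pmd, addr);
            if (!pte)
                    return -ENOMEM;
            set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
            return 0;
    }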

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/mm/remap.c         |  6 +---
 arch/arm/mm/consistent.c      |  6 +---
 arch/arm/mm/ioremap.c         |  4 +--
 arch/arm26/mm/memc.c          |  3 +-
 arch/cris/mm/ioremap.c        |  4 +--
 arch/frv/mm/dma-alloc.c       |  5 +--
 arch/i386/mm/ioremap.c        |  4 +--
 arch/ia64/mm/init.c           | 11 ++-----
 arch/m32r/mm/ioremap.c        |  4 +--
 arch/m68k/mm/kmap.c           |  2 +-
 arch/m68k/sun3x/dvma.c        |  2 +-
 arch/mips/mm/ioremap.c        |  4 +--
 arch/parisc/kernel/pci-dma.c  |  2 +-
 arch/parisc/mm/ioremap.c      |  6 ++--
 arch/ppc/kernel/dma-mapping.c |  6 +---
 arch/ppc/mm/4xx_mmu.c         |  4 ---
 arch/ppc/mm/pgtable.c         |  4 +--
 arch/ppc64/mm/imalloc.c       |  5 ---
 arch/ppc64/mm/init.c          |  4 +--
 arch/s390/mm/ioremap.c        |  4 +--
 arch/sh/mm/ioremap.c          |  4 +--
 arch/sh64/mm/ioremap.c        |  4 +--
 arch/x86_64/mm/ioremap.c      |  4 +--
 include/linux/mm.h            |  2 +-
 mm/memory.c                   | 60 ++++++++++++++++-------------------
 mm/vmalloc.c                  |  4 +--
 26 files changed, 54 insertions(+), 114 deletions(-)

diff --git a/arch/alpha/mm/remap.c b/arch/alpha/mm/remap.c
index 19817ad3d89b..a78356c3ead5 100644
--- a/arch/alpha/mm/remap.c
+++ b/arch/alpha/mm/remap.c
@@ -2,7 +2,6 @@
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
 
-/* called with the page_table_lock held */
 static inline void 
 remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, 
 	       unsigned long phys_addr, unsigned long flags)
@@ -31,7 +30,6 @@ remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
 	} while (address && (address < end));
 }
 
-/* called with the page_table_lock held */
 static inline int 
 remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, 
 	       unsigned long phys_addr, unsigned long flags)
@@ -46,7 +44,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, 
@@ -70,7 +68,6 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
 		pmd = pmd_alloc(&init_mm, dir, address);
@@ -84,7 +81,6 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	return error;
 }
 
diff --git a/arch/arm/mm/consistent.c b/arch/arm/mm/consistent.c
index 82f4d5e27c54..47b0b767f080 100644
--- a/arch/arm/mm/consistent.c
+++ b/arch/arm/mm/consistent.c
@@ -397,8 +397,6 @@ static int __init consistent_init(void)
 	pte_t *pte;
 	int ret = 0;
 
-	spin_lock(&init_mm.page_table_lock);
-
 	do {
 		pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
 		pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE);
@@ -409,7 +407,7 @@ static int __init consistent_init(void)
 		}
 		WARN_ON(!pmd_none(*pmd));
 
-		pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE);
+		pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
 		if (!pte) {
 			printk(KERN_ERR "%s: no pte tables\n", __func__);
 			ret = -ENOMEM;
@@ -419,8 +417,6 @@ static int __init consistent_init(void)
 		consistent_pte = pte;
 	} while (0);
 
-	spin_unlock(&init_mm.page_table_lock);
-
 	return ret;
 }
 
diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c
index 6fb1258df1b5..0f128c28fee4 100644
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -75,7 +75,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
 
 	pgprot = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | L_PTE_WRITE | flags);
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, pgprot);
@@ -97,7 +97,6 @@ remap_area_pages(unsigned long start, unsigned long phys_addr,
 	phys_addr -= address;
 	dir = pgd_offset(&init_mm, address);
 	BUG_ON(address >= end);
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
 		if (!pmd) {
@@ -114,7 +113,6 @@ remap_area_pages(unsigned long start, unsigned long phys_addr,
 		dir++;
 	} while (address && (address < end));
 
-	spin_unlock(&init_mm.page_table_lock);
 	flush_cache_vmap(start, end);
 	return err;
 }
diff --git a/arch/arm26/mm/memc.c b/arch/arm26/mm/memc.c
index 8e8a2bb2487d..d6b008b8db76 100644
--- a/arch/arm26/mm/memc.c
+++ b/arch/arm26/mm/memc.c
@@ -92,7 +92,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	if (!new_pmd)
 		goto no_pmd;
 
-	new_pte = pte_alloc_kernel(mm, new_pmd, 0);
+	new_pte = pte_alloc_map(mm, new_pmd, 0);
 	if (!new_pte)
 		goto no_pte;
 
@@ -101,6 +101,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	init_pte = pte_offset(init_pmd, 0);
 
 	set_pte(new_pte, *init_pte);
+	pte_unmap(new_pte);
 
 	/*
 	 * the page table entries are zeroed
diff --git a/arch/cris/mm/ioremap.c b/arch/cris/mm/ioremap.c
index ebba11e270fa..a92ac9877582 100644
--- a/arch/cris/mm/ioremap.c
+++ b/arch/cris/mm/ioremap.c
@@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, prot);
@@ -74,7 +74,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pud_t *pud;
 		pmd_t *pmd;
@@ -94,7 +93,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c
index cfc4f97490c6..342823aad758 100644
--- a/arch/frv/mm/dma-alloc.c
+++ b/arch/frv/mm/dma-alloc.c
@@ -55,21 +55,18 @@ static int map_page(unsigned long va, unsigned long pa, pgprot_t prot)
 	pte_t *pte;
 	int err = -ENOMEM;
 
-	spin_lock(&init_mm.page_table_lock);
-
 	/* Use upper 10 bits of VA to index the first level map */
 	pge = pgd_offset_k(va);
 	pue = pud_offset(pge, va);
 	pme = pmd_offset(pue, va);
 
 	/* Use middle 10 bits of VA to index the second-level map */
-	pte = pte_alloc_kernel(&init_mm, pme, va);
+	pte = pte_alloc_kernel(pme, va);
 	if (pte != 0) {
 		err = 0;
 		set_pte(pte, mk_pte_phys(pa & PAGE_MASK, prot));
 	}
 
-	spin_unlock(&init_mm.page_table_lock);
 	return err;
 }
 
diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c
index f379b8d67558..5d09de8d1c6b 100644
--- a/arch/i386/mm/ioremap.c
+++ b/arch/i386/mm/ioremap.c
@@ -28,7 +28,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
 	unsigned long pfn;
 
 	pfn = phys_addr >> PAGE_SHIFT;
-	pte = pte_alloc_kernel(&init_mm, pmd, addr);
+	pte = pte_alloc_kernel(pmd, addr);
 	if (!pte)
 		return -ENOMEM;
 	do {
@@ -87,14 +87,12 @@ static int ioremap_page_range(unsigned long addr,
 	flush_cache_all();
 	phys_addr -= addr;
 	pgd = pgd_offset_k(addr);
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return err;
 }
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 0063b2c50908..e3215ba64ffd 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -275,26 +275,21 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
 
 	pgd = pgd_offset_k(address);		/* note: this is NOT pgd_offset()! */
 
-	spin_lock(&init_mm.page_table_lock);
 	{
 		pud = pud_alloc(&init_mm, pgd, address);
 		if (!pud)
 			goto out;
-
 		pmd = pmd_alloc(&init_mm, pud, address);
 		if (!pmd)
 			goto out;
-		pte = pte_alloc_map(&init_mm, pmd, address);
+		pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			goto out;
-		if (!pte_none(*pte)) {
-			pte_unmap(pte);
+		if (!pte_none(*pte))
 			goto out;
-		}
 		set_pte(pte, mk_pte(page, pgprot));
-		pte_unmap(pte);
 	}
-  out:	spin_unlock(&init_mm.page_table_lock);
+  out:
 	/* no need for flush_tlb */
 	return page;
 }
diff --git a/arch/m32r/mm/ioremap.c b/arch/m32r/mm/ioremap.c
index 70c59055c19c..a151849a605e 100644
--- a/arch/m32r/mm/ioremap.c
+++ b/arch/m32r/mm/ioremap.c
@@ -67,7 +67,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -90,7 +90,6 @@ remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
 		pmd = pmd_alloc(&init_mm, dir, address);
@@ -104,7 +103,6 @@ remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
diff --git a/arch/m68k/mm/kmap.c b/arch/m68k/mm/kmap.c
index 5dcb3fa35ea9..fe2383e36b06 100644
--- a/arch/m68k/mm/kmap.c
+++ b/arch/m68k/mm/kmap.c
@@ -201,7 +201,7 @@ void *__ioremap(unsigned long physaddr, unsigned long size, int cacheflag)
 			virtaddr += PTRTREESIZE;
 			size -= PTRTREESIZE;
 		} else {
-			pte_dir = pte_alloc_kernel(&init_mm, pmd_dir, virtaddr);
+			pte_dir = pte_alloc_kernel(pmd_dir, virtaddr);
 			if (!pte_dir) {
 				printk("ioremap: no mem for pte_dir\n");
 				return NULL;
diff --git a/arch/m68k/sun3x/dvma.c b/arch/m68k/sun3x/dvma.c
index 32e55adfeb8e..117481e86305 100644
--- a/arch/m68k/sun3x/dvma.c
+++ b/arch/m68k/sun3x/dvma.c
@@ -116,7 +116,7 @@ inline int dvma_map_cpu(unsigned long kaddr,
 			pte_t *pte;
 			unsigned long end3;
 
-			if((pte = pte_alloc_kernel(&init_mm, pmd, vaddr)) == NULL) {
+			if((pte = pte_alloc_kernel(pmd, vaddr)) == NULL) {
 				ret = -ENOMEM;
 				goto out;
 			}
diff --git a/arch/mips/mm/ioremap.c b/arch/mips/mm/ioremap.c
index 9c44ca70befa..3101d1db5592 100644
--- a/arch/mips/mm/ioremap.c
+++ b/arch/mips/mm/ioremap.c
@@ -55,7 +55,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address,
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -77,7 +77,6 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pud_t *pud;
 		pmd_t *pmd;
@@ -96,7 +95,6 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index ae6213d71670..f94a02ef3d95 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -114,7 +114,7 @@ static inline int map_pmd_uncached(pmd_t * pmd, unsigned long vaddr,
 	if (end > PGDIR_SIZE)
 		end = PGDIR_SIZE;
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, vaddr);
+		pte_t * pte = pte_alloc_kernel(pmd, vaddr);
 		if (!pte)
 			return -ENOMEM;
 		if (map_pte_uncached(pte, orig_vaddr, end - vaddr, paddr_ptr))
diff --git a/arch/parisc/mm/ioremap.c b/arch/parisc/mm/ioremap.c
index f2df502cdae3..5c7a1b3b9326 100644
--- a/arch/parisc/mm/ioremap.c
+++ b/arch/parisc/mm/ioremap.c
@@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(NULL, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -75,10 +75,9 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
-		pmd = pmd_alloc(dir, address);
+		pmd = pmd_alloc(&init_mm, dir, address);
 		error = -ENOMEM;
 		if (!pmd)
 			break;
@@ -89,7 +88,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
diff --git a/arch/ppc/kernel/dma-mapping.c b/arch/ppc/kernel/dma-mapping.c
index 0f710d2baec6..685fd0defe23 100644
--- a/arch/ppc/kernel/dma-mapping.c
+++ b/arch/ppc/kernel/dma-mapping.c
@@ -335,8 +335,6 @@ static int __init dma_alloc_init(void)
 	pte_t *pte;
 	int ret = 0;
 
-	spin_lock(&init_mm.page_table_lock);
-
 	do {
 		pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
 		pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE);
@@ -347,7 +345,7 @@ static int __init dma_alloc_init(void)
 		}
 		WARN_ON(!pmd_none(*pmd));
 
-		pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE);
+		pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
 		if (!pte) {
 			printk(KERN_ERR "%s: no pte tables\n", __func__);
 			ret = -ENOMEM;
@@ -357,8 +355,6 @@ static int __init dma_alloc_init(void)
 		consistent_pte = pte;
 	} while (0);
 
-	spin_unlock(&init_mm.page_table_lock);
-
 	return ret;
 }
 
diff --git a/arch/ppc/mm/4xx_mmu.c b/arch/ppc/mm/4xx_mmu.c
index b7bcbc232f39..4d006aa1a0d1 100644
--- a/arch/ppc/mm/4xx_mmu.c
+++ b/arch/ppc/mm/4xx_mmu.c
@@ -110,13 +110,11 @@ unsigned long __init mmu_mapin_ram(void)
 		pmd_t *pmdp;
 		unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE;
 
-		spin_lock(&init_mm.page_table_lock);
 		pmdp = pmd_offset(pgd_offset_k(v), v);
 		pmd_val(*pmdp++) = val;
 		pmd_val(*pmdp++) = val;
 		pmd_val(*pmdp++) = val;
 		pmd_val(*pmdp++) = val;
-		spin_unlock(&init_mm.page_table_lock);
 
 		v += LARGE_PAGE_SIZE_16M;
 		p += LARGE_PAGE_SIZE_16M;
@@ -127,10 +125,8 @@ unsigned long __init mmu_mapin_ram(void)
 		pmd_t *pmdp;
 		unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE;
 
-		spin_lock(&init_mm.page_table_lock);
 		pmdp = pmd_offset(pgd_offset_k(v), v);
 		pmd_val(*pmdp) = val;
-		spin_unlock(&init_mm.page_table_lock);
 
 		v += LARGE_PAGE_SIZE_4M;
 		p += LARGE_PAGE_SIZE_4M;
diff --git a/arch/ppc/mm/pgtable.c b/arch/ppc/mm/pgtable.c
index 43505b1fc5d8..6ea9185fd120 100644
--- a/arch/ppc/mm/pgtable.c
+++ b/arch/ppc/mm/pgtable.c
@@ -280,18 +280,16 @@ map_page(unsigned long va, phys_addr_t pa, int flags)
 	pte_t *pg;
 	int err = -ENOMEM;
 
-	spin_lock(&init_mm.page_table_lock);
 	/* Use upper 10 bits of VA to index the first level map */
 	pd = pmd_offset(pgd_offset_k(va), va);
 	/* Use middle 10 bits of VA to index the second-level map */
-	pg = pte_alloc_kernel(&init_mm, pd, va);
+	pg = pte_alloc_kernel(pd, va);
 	if (pg != 0) {
 		err = 0;
 		set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags)));
 		if (mem_init_done)
 			flush_HPTE(0, va, pmd_val(*pd));
 	}
-	spin_unlock(&init_mm.page_table_lock);
 	return err;
 }
 
diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c
index c65b87b92756..f4ca29cf5364 100644
--- a/arch/ppc64/mm/imalloc.c
+++ b/arch/ppc64/mm/imalloc.c
@@ -300,12 +300,7 @@ void im_free(void * addr)
 	for (p = &imlist ; (tmp = *p) ; p = &tmp->next) {
 		if (tmp->addr == addr) {
 			*p = tmp->next;
-
-			/* XXX: do we need the lock? */
-			spin_lock(&init_mm.page_table_lock);
 			unmap_vm_area(tmp);
-			spin_unlock(&init_mm.page_table_lock);
-
 			kfree(tmp);
 			up(&imlist_sem);
 			return;
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index be64b157afce..a45584b3440c 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -155,7 +155,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 	unsigned long vsid;
 
 	if (mem_init_done) {
-		spin_lock(&init_mm.page_table_lock);
 		pgdp = pgd_offset_k(ea);
 		pudp = pud_alloc(&init_mm, pgdp, ea);
 		if (!pudp)
@@ -163,12 +162,11 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 		pmdp = pmd_alloc(&init_mm, pudp, ea);
 		if (!pmdp)
 			return -ENOMEM;
-		ptep = pte_alloc_kernel(&init_mm, pmdp, ea);
+		ptep = pte_alloc_kernel(pmdp, ea);
 		if (!ptep)
 			return -ENOMEM;
 		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 							  __pgprot(flags)));
-		spin_unlock(&init_mm.page_table_lock);
 	} else {
 		unsigned long va, vpn, hash, hpteg;
 
diff --git a/arch/s390/mm/ioremap.c b/arch/s390/mm/ioremap.c
index c6c39d868bc8..0f6e9ecbefe2 100644
--- a/arch/s390/mm/ioremap.c
+++ b/arch/s390/mm/ioremap.c
@@ -58,7 +58,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -80,7 +80,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
 		pmd = pmd_alloc(&init_mm, dir, address);
@@ -94,7 +93,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return 0;
 }
diff --git a/arch/sh/mm/ioremap.c b/arch/sh/mm/ioremap.c
index 9f490c2742f0..e794e27a72f1 100644
--- a/arch/sh/mm/ioremap.c
+++ b/arch/sh/mm/ioremap.c
@@ -57,7 +57,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address,
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -79,7 +79,6 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
 		pmd = pmd_alloc(&init_mm, dir, address);
@@ -93,7 +92,6 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
diff --git a/arch/sh64/mm/ioremap.c b/arch/sh64/mm/ioremap.c
index f4003da556bc..fb1866fa2c9d 100644
--- a/arch/sh64/mm/ioremap.c
+++ b/arch/sh64/mm/ioremap.c
@@ -79,7 +79,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 		BUG();
 
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -101,7 +101,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
 		error = -ENOMEM;
@@ -115,7 +114,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return 0;
 }
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
index 6972df480d2b..ecf7acb5db9b 100644
--- a/arch/x86_64/mm/ioremap.c
+++ b/arch/x86_64/mm/ioremap.c
@@ -60,7 +60,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -105,7 +105,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pud_t *pud;
 		pud = pud_alloc(&init_mm, pgd, address);
@@ -119,7 +118,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgd++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 89398032bc4b..b9fa82b96d9e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -706,7 +706,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 extern int vmtruncate(struct inode * inode, loff_t offset);
 extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
 extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern pte_t *FASTCALL(pte_alloc_kernel(pmd_t *pmd, unsigned long address));
 extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
 extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
diff --git a/mm/memory.c b/mm/memory.c
index 692ad810263d..95a4553c75f7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -307,28 +307,22 @@ out:
 	return pte_offset_map(pmd, address);
 }
 
-pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+pte_t fastcall * pte_alloc_kernel(pmd_t *pmd, unsigned long address)
 {
 	if (!pmd_present(*pmd)) {
 		pte_t *new;
 
-		spin_unlock(&mm->page_table_lock);
-		new = pte_alloc_one_kernel(mm, address);
-		spin_lock(&mm->page_table_lock);
+		new = pte_alloc_one_kernel(&init_mm, address);
 		if (!new)
 			return NULL;
 
-		/*
-		 * Because we dropped the lock, we should re-check the
-		 * entry, as somebody else could have populated it..
-		 */
-		if (pmd_present(*pmd)) {
+		spin_lock(&init_mm.page_table_lock);
+		if (pmd_present(*pmd))
 			pte_free_kernel(new);
-			goto out;
-		}
-		pmd_populate_kernel(mm, pmd, new);
+		else
+			pmd_populate_kernel(&init_mm, pmd, new);
+		spin_unlock(&init_mm.page_table_lock);
 	}
-out:
 	return pte_offset_kernel(pmd, address);
 }
 
@@ -2097,30 +2091,30 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.
- *
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
+ * We've already handled the fast-path in-line.
  */
 pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
 	pud_t *new;
 
-	spin_unlock(&mm->page_table_lock);
+	if (mm != &init_mm)		/* Temporary bridging hack */
+		spin_unlock(&mm->page_table_lock);
 	new = pud_alloc_one(mm, address);
-	spin_lock(&mm->page_table_lock);
-	if (!new)
+	if (!new) {
+		if (mm != &init_mm)	/* Temporary bridging hack */
+			spin_lock(&mm->page_table_lock);
 		return NULL;
+	}
 
-	/*
-	 * Because we dropped the lock, we should re-check the
-	 * entry, as somebody else could have populated it..
-	 */
+	spin_lock(&mm->page_table_lock);
 	if (pgd_present(*pgd)) {
 		pud_free(new);
 		goto out;
 	}
 	pgd_populate(mm, pgd, new);
  out:
+	if (mm == &init_mm)		/* Temporary bridging hack */
+		spin_unlock(&mm->page_table_lock);
 	return pud_offset(pgd, address);
 }
 #endif /* __PAGETABLE_PUD_FOLDED */
@@ -2128,24 +2122,22 @@ pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr
 #ifndef __PAGETABLE_PMD_FOLDED
 /*
  * Allocate page middle directory.
- *
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
+ * We've already handled the fast-path in-line.
  */
 pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
 	pmd_t *new;
 
-	spin_unlock(&mm->page_table_lock);
+	if (mm != &init_mm)		/* Temporary bridging hack */
+		spin_unlock(&mm->page_table_lock);
 	new = pmd_alloc_one(mm, address);
-	spin_lock(&mm->page_table_lock);
-	if (!new)
+	if (!new) {
+		if (mm != &init_mm)	/* Temporary bridging hack */
+			spin_lock(&mm->page_table_lock);
 		return NULL;
+	}
 
-	/*
-	 * Because we dropped the lock, we should re-check the
-	 * entry, as somebody else could have populated it..
-	 */
+	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
 	if (pud_present(*pud)) {
 		pmd_free(new);
@@ -2161,6 +2153,8 @@ pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr
 #endif /* __ARCH_HAS_4LEVEL_HACK */
 
  out:
+	if (mm == &init_mm)		/* Temporary bridging hack */
+		spin_unlock(&mm->page_table_lock);
 	return pmd_offset(pud, address);
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5e9120598799..54a90e83cb31 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -89,7 +89,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
 {
 	pte_t *pte;
 
-	pte = pte_alloc_kernel(&init_mm, pmd, addr);
+	pte = pte_alloc_kernel(pmd, addr);
 	if (!pte)
 		return -ENOMEM;
 	do {
@@ -147,14 +147,12 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
 
 	BUG_ON(addr >= end);
 	pgd = pgd_offset_k(addr);
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		err = vmap_pud_range(pgd, addr, next, prot, pages);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-	spin_unlock(&init_mm.page_table_lock);
 	flush_cache_vmap((unsigned long) area->addr, end);
 	return err;
 }

From 1bb3630e89cb8a7b3d3807629c20c5bad88290ff Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:22 -0700
Subject: [PATCH 48/91] [PATCH] mm: ptd_alloc inline and out

It seems odd to me that, whereas pud_alloc and pmd_alloc test inline and only
call the out-of-line __pud_alloc and __pmd_alloc if an allocation is needed,
pte_alloc_map and pte_alloc_kernel are entirely out-of-line.  Though it does
add a little to kernel size, change them to macros testing inline, calling
__pte_alloc or __pte_alloc_kernel to allocate out-of-line.  Mark none of them
as fastcall; leave that decision to CONFIG_REGPARM.

It also seems more natural for the out-of-line functions to leave the offset
calculation and map to the inline, which has to do it anyway for the common
case.  At least mremap move wants __pte_alloc without _map.

Macros rather than inline functions, certainly to avoid the header file issues
which arise from CONFIG_HIGHPTE needing kmap_types.h, but also in case any
architectures I haven't built would have other such problems.
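
Where a caller only needs the page table to exist, without mapping a pte, the
new out-of-line __pte_alloc() can be called directly, as the mremap change
below does.  A hedged sketch (ensure_pte_table() is an illustrative name, not
a function added by this patch; note that at this point in the series callers
are still expected to hold mm->page_table_lock, a requirement the next patch
removes):

    static pmd_t *ensure_pte_table(struct mm_struct *mm, pmd_t *pmd,
                                   unsigned long addr)
    {
            /* inline fast path: only go out of line when the pmd is not
             * yet populated; __pte_alloc() returns 0 or -ENOMEM */
            if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
                    return NULL;
            return pmd;
    }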

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/4level-fixup.h | 11 +---
 include/linux/mm.h                 | 38 ++++++------
 mm/memory.c                        | 93 ++++++++++++------------------
 mm/mremap.c                        |  7 +--
 4 files changed, 61 insertions(+), 88 deletions(-)

diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
index c20ec257ecc0..68c6fea994d9 100644
--- a/include/asm-generic/4level-fixup.h
+++ b/include/asm-generic/4level-fixup.h
@@ -10,14 +10,9 @@
 
 #define pud_t				pgd_t
 
-#define pmd_alloc(mm, pud, address)			\
-({	pmd_t *ret;					\
-	if (pgd_none(*pud))				\
- 		ret = __pmd_alloc(mm, pud, address);	\
- 	else						\
-		ret = pmd_offset(pud, address);		\
- 	ret;						\
-})
+#define pmd_alloc(mm, pud, address) \
+	((unlikely(pgd_none(*(pud))) && __pmd_alloc(mm, pud, address))? \
+ 		NULL: pmd_offset(pud, address))
 
 #define pud_alloc(mm, pgd, address)	(pgd)
 #define pud_offset(pgd, start)		(pgd)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b9fa82b96d9e..22c2d6922c0e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -704,10 +704,6 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 }
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
-extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
-extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc_kernel(pmd_t *pmd, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
 extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
 extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
@@ -760,32 +756,36 @@ struct shrinker;
 extern struct shrinker *set_shrinker(int, shrinker_t);
 extern void remove_shrinker(struct shrinker *shrinker);
 
-/*
- * On a two-level or three-level page table, this ends up being trivial. Thus
- * the inlining and the symmetry break with pte_alloc_map() that does all
- * of this out-of-line.
- */
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
+
 /*
  * The following ifdef needed to get the 4level-fixup.h header to work.
  * Remove it when 4level-fixup.h has been removed.
  */
-#ifdef CONFIG_MMU
-#ifndef __ARCH_HAS_4LEVEL_HACK 
+#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
 static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
-	if (pgd_none(*pgd))
-		return __pud_alloc(mm, pgd, address);
-	return pud_offset(pgd, address);
+	return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))?
+		NULL: pud_offset(pgd, address);
 }
 
 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
-	if (pud_none(*pud))
-		return __pmd_alloc(mm, pud, address);
-	return pmd_offset(pud, address);
+	return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
+		NULL: pmd_offset(pud, address);
 }
-#endif
-#endif /* CONFIG_MMU */
+#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
+
+#define pte_alloc_map(mm, pmd, address)			\
+	((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
+		NULL: pte_offset_map(pmd, address))
+
+#define pte_alloc_kernel(pmd, address)			\
+	((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
+		NULL: pte_offset_kernel(pmd, address))
 
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, pg_data_t *pgdat,
diff --git a/mm/memory.c b/mm/memory.c
index 95a4553c75f7..4bdd1186b43b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -280,50 +280,39 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 	}
 }
 
-pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
-				unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
-	if (!pmd_present(*pmd)) {
-		struct page *new;
+	struct page *new;
 
-		spin_unlock(&mm->page_table_lock);
-		new = pte_alloc_one(mm, address);
-		spin_lock(&mm->page_table_lock);
-		if (!new)
-			return NULL;
-		/*
-		 * Because we dropped the lock, we should re-check the
-		 * entry, as somebody else could have populated it..
-		 */
-		if (pmd_present(*pmd)) {
-			pte_free(new);
-			goto out;
-		}
+	spin_unlock(&mm->page_table_lock);
+	new = pte_alloc_one(mm, address);
+	spin_lock(&mm->page_table_lock);
+	if (!new)
+		return -ENOMEM;
+
+	if (pmd_present(*pmd))		/* Another has populated it */
+		pte_free(new);
+	else {
 		mm->nr_ptes++;
 		inc_page_state(nr_page_table_pages);
 		pmd_populate(mm, pmd, new);
 	}
-out:
-	return pte_offset_map(pmd, address);
+	return 0;
 }
 
-pte_t fastcall * pte_alloc_kernel(pmd_t *pmd, unsigned long address)
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
 {
-	if (!pmd_present(*pmd)) {
-		pte_t *new;
+	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
+	if (!new)
+		return -ENOMEM;
 
-		new = pte_alloc_one_kernel(&init_mm, address);
-		if (!new)
-			return NULL;
-
-		spin_lock(&init_mm.page_table_lock);
-		if (pmd_present(*pmd))
-			pte_free_kernel(new);
-		else
-			pmd_populate_kernel(&init_mm, pmd, new);
-		spin_unlock(&init_mm.page_table_lock);
-	}
-	return pte_offset_kernel(pmd, address);
+	spin_lock(&init_mm.page_table_lock);
+	if (pmd_present(*pmd))		/* Another has populated it */
+		pte_free_kernel(new);
+	else
+		pmd_populate_kernel(&init_mm, pmd, new);
+	spin_unlock(&init_mm.page_table_lock);
+	return 0;
 }
 
 static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
@@ -2093,7 +2082,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  * Allocate page upper directory.
  * We've already handled the fast-path in-line.
  */
-pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
 	pud_t *new;
 
@@ -2103,19 +2092,17 @@ pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr
 	if (!new) {
 		if (mm != &init_mm)	/* Temporary bridging hack */
 			spin_lock(&mm->page_table_lock);
-		return NULL;
+		return -ENOMEM;
 	}
 
 	spin_lock(&mm->page_table_lock);
-	if (pgd_present(*pgd)) {
+	if (pgd_present(*pgd))		/* Another has populated it */
 		pud_free(new);
-		goto out;
-	}
-	pgd_populate(mm, pgd, new);
- out:
+	else
+		pgd_populate(mm, pgd, new);
 	if (mm == &init_mm)		/* Temporary bridging hack */
 		spin_unlock(&mm->page_table_lock);
-	return pud_offset(pgd, address);
+	return 0;
 }
 #endif /* __PAGETABLE_PUD_FOLDED */
 
@@ -2124,7 +2111,7 @@ pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr
  * Allocate page middle directory.
  * We've already handled the fast-path in-line.
  */
-pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
 	pmd_t *new;
 
@@ -2134,28 +2121,24 @@ pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr
 	if (!new) {
 		if (mm != &init_mm)	/* Temporary bridging hack */
 			spin_lock(&mm->page_table_lock);
-		return NULL;
+		return -ENOMEM;
 	}
 
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
-	if (pud_present(*pud)) {
+	if (pud_present(*pud))		/* Another has populated it */
 		pmd_free(new);
-		goto out;
-	}
-	pud_populate(mm, pud, new);
+	else
+		pud_populate(mm, pud, new);
 #else
-	if (pgd_present(*pud)) {
+	if (pgd_present(*pud))		/* Another has populated it */
 		pmd_free(new);
-		goto out;
-	}
-	pgd_populate(mm, pud, new);
+	else
+		pgd_populate(mm, pud, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
-
- out:
 	if (mm == &init_mm)		/* Temporary bridging hack */
 		spin_unlock(&mm->page_table_lock);
-	return pmd_offset(pud, address);
+	return 0;
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
diff --git a/mm/mremap.c b/mm/mremap.c
index ccf456477020..616facc3d28a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -51,7 +51,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd = NULL;
-	pte_t *pte;
 
 	/*
 	 * We do need page_table_lock: because allocators expect that.
@@ -66,12 +65,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
 	if (!pmd)
 		goto out;
 
-	pte = pte_alloc_map(mm, pmd, addr);
-	if (!pte) {
+	if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
 		pmd = NULL;
-		goto out;
-	}
-	pte_unmap(pte);
 out:
 	spin_unlock(&mm->page_table_lock);
 	return pmd;

From c74df32c724a1652ad8399b4891bb02c9d43743a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:23 -0700
Subject: [PATCH 49/91] [PATCH] mm: ptd_alloc take ptlock

Second step in pushing down the page_table_lock.  Remove the temporary
bridging hack from __pud_alloc, __pmd_alloc, __pte_alloc: expect callers not
to hold page_table_lock, whether it's on init_mm or a user mm; take
page_table_lock internally to check if a racing task already allocated.

Convert their callers in common code.  But avoid coming back to change them
again later: instead of moving the spin_lock(&mm->page_table_lock) down,
switch over to new macros pte_alloc_map_lock and pte_unmap_unlock, which
encapsulate the mapping+locking and unlocking+unmapping together, and in the
end may use alternatives to the mm page_table_lock itself.

These callers all hold mmap_sem (some exclusively, some not), so at no level
can a page table be whipped away from beneath them; and pte_alloc uses the
"atomic" pmd_present to test whether it needs to allocate.  It appears that on
all arches we can safely descend without page_table_lock.
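
A hedged sketch of the caller pattern that common code is converted to here;
set_one_pte() is an illustrative helper, not something added by this patch,
and it assumes the caller holds mmap_sem as described above.

    static int set_one_pte(struct mm_struct *mm, unsigned long addr,
                           pte_t newval)
    {
            pgd_t *pgd = pgd_offset(mm, addr);
            pud_t *pud = pud_alloc(mm, pgd, addr);  /* no ptlock held */
            pmd_t *pmd;
            pte_t *pte;
            spinlock_t *ptl;

            if (!pud)
                    return -ENOMEM;
            pmd = pmd_alloc(mm, pud, addr);
            if (!pmd)
                    return -ENOMEM;
            /* map and lock together; unlock and unmap together */
            pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
            if (!pte)
                    return -ENOMEM;
            if (pte_none(*pte))
                    set_pte_at(mm, addr, pte, newval);
            pte_unmap_unlock(pte, ptl);
            return 0;
    }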

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c          |  14 +++---
 include/linux/mm.h |  18 ++++++++
 kernel/fork.c      |   2 -
 mm/fremap.c        |  48 ++++++++-------------
 mm/hugetlb.c       |  12 ++++--
 mm/memory.c        | 104 ++++++++++++++-------------------------------
 mm/mremap.c        |  27 ++++--------
 7 files changed, 90 insertions(+), 135 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 9bb55c8cf224..ba73797eb4cb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -309,25 +309,24 @@ void install_arg_page(struct vm_area_struct *vma,
 	pud_t * pud;
 	pmd_t * pmd;
 	pte_t * pte;
+	spinlock_t *ptl;
 
 	if (unlikely(anon_vma_prepare(vma)))
-		goto out_sig;
+		goto out;
 
 	flush_dcache_page(page);
 	pgd = pgd_offset(mm, address);
-
-	spin_lock(&mm->page_table_lock);
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
 		goto out;
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
 		goto out;
-	pte = pte_alloc_map(mm, pmd, address);
+	pte = pte_alloc_map_lock(mm, pmd, address, &ptl);
 	if (!pte)
 		goto out;
 	if (!pte_none(*pte)) {
-		pte_unmap(pte);
+		pte_unmap_unlock(pte, ptl);
 		goto out;
 	}
 	inc_mm_counter(mm, anon_rss);
@@ -335,14 +334,11 @@ void install_arg_page(struct vm_area_struct *vma,
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
 	page_add_anon_rmap(page, vma, address);
-	pte_unmap(pte);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(pte, ptl);
 
 	/* no need for flush_tlb */
 	return;
 out:
-	spin_unlock(&mm->page_table_lock);
-out_sig:
 	__free_page(page);
 	force_sig(SIGKILL, current);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 22c2d6922c0e..d4c3512e7db4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -779,10 +779,28 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
+#define pte_offset_map_lock(mm, pmd, address, ptlp)	\
+({							\
+	spinlock_t *__ptl = &(mm)->page_table_lock;	\
+	pte_t *__pte = pte_offset_map(pmd, address);	\
+	*(ptlp) = __ptl;				\
+	spin_lock(__ptl);				\
+	__pte;						\
+})
+
+#define pte_unmap_unlock(pte, ptl)	do {		\
+	spin_unlock(ptl);				\
+	pte_unmap(pte);					\
+} while (0)
+
 #define pte_alloc_map(mm, pmd, address)			\
 	((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
 		NULL: pte_offset_map(pmd, address))
 
+#define pte_alloc_map_lock(mm, pmd, address, ptlp)	\
+	((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
+		NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
+
 #define pte_alloc_kernel(pmd, address)			\
 	((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
 		NULL: pte_offset_kernel(pmd, address))
diff --git a/kernel/fork.c b/kernel/fork.c
index 2a587b3224e3..8a069612eac3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -255,7 +255,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		/*
 		 * Link in the new vma and copy the page table entries.
 		 */
-		spin_lock(&mm->page_table_lock);
 		*pprev = tmp;
 		pprev = &tmp->vm_next;
 
@@ -265,7 +264,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 
 		mm->map_count++;
 		retval = copy_page_range(mm, oldmm, tmp);
-		spin_unlock(&mm->page_table_lock);
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
 			tmp->vm_ops->open(tmp);
diff --git a/mm/fremap.c b/mm/fremap.c
index 49719a35769a..d862be3bc3e3 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -63,23 +63,20 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pud_t *pud;
 	pgd_t *pgd;
 	pte_t pte_val;
+	spinlock_t *ptl;
 
 	BUG_ON(vma->vm_flags & VM_RESERVED);
 
 	pgd = pgd_offset(mm, addr);
-	spin_lock(&mm->page_table_lock);
-	
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
-		goto err_unlock;
-
+		goto out;
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
-		goto err_unlock;
-
-	pte = pte_alloc_map(mm, pmd, addr);
+		goto out;
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
-		goto err_unlock;
+		goto out;
 
 	/*
 	 * This page may have been truncated. Tell the
@@ -89,10 +86,10 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	inode = vma->vm_file->f_mapping->host;
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (!page->mapping || page->index >= size)
-		goto err_unlock;
+		goto unlock;
 	err = -ENOMEM;
 	if (page_mapcount(page) > INT_MAX/2)
-		goto err_unlock;
+		goto unlock;
 
 	if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
 		inc_mm_counter(mm, file_rss);
@@ -101,17 +98,15 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 	page_add_file_rmap(page);
 	pte_val = *pte;
-	pte_unmap(pte);
 	update_mmu_cache(vma, addr, pte_val);
-
 	err = 0;
-err_unlock:
-	spin_unlock(&mm->page_table_lock);
+unlock:
+	pte_unmap_unlock(pte, ptl);
+out:
 	return err;
 }
 EXPORT_SYMBOL(install_page);
 
-
 /*
  * Install a file pte to a given virtual memory address, release any
  * previously existing mapping.
@@ -125,23 +120,20 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	pud_t *pud;
 	pgd_t *pgd;
 	pte_t pte_val;
+	spinlock_t *ptl;
 
 	BUG_ON(vma->vm_flags & VM_RESERVED);
 
 	pgd = pgd_offset(mm, addr);
-	spin_lock(&mm->page_table_lock);
-	
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
-		goto err_unlock;
-
+		goto out;
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
-		goto err_unlock;
-
-	pte = pte_alloc_map(mm, pmd, addr);
+		goto out;
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
-		goto err_unlock;
+		goto out;
 
 	if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
 		update_hiwater_rss(mm);
@@ -150,17 +142,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
 	pte_val = *pte;
-	pte_unmap(pte);
 	update_mmu_cache(vma, addr, pte_val);
-	spin_unlock(&mm->page_table_lock);
-	return 0;
-
-err_unlock:
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(pte, ptl);
+	err = 0;
+out:
 	return err;
 }
 
-
 /***
  * sys_remap_file_pages - remap arbitrary pages of a shared backing store
  *                        file within an existing vma.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac5f044bf514..ea0826ff2663 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -277,12 +277,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	unsigned long addr;
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+		src_pte = huge_pte_offset(src, addr);
+		if (!src_pte)
+			continue;
 		dst_pte = huge_pte_alloc(dst, addr);
 		if (!dst_pte)
 			goto nomem;
+		spin_lock(&dst->page_table_lock);
 		spin_lock(&src->page_table_lock);
-		src_pte = huge_pte_offset(src, addr);
-		if (src_pte && !pte_none(*src_pte)) {
+		if (!pte_none(*src_pte)) {
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
@@ -290,6 +293,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
 		spin_unlock(&src->page_table_lock);
+		spin_unlock(&dst->page_table_lock);
 	}
 	return 0;
 
@@ -354,7 +358,6 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 
 	hugetlb_prefault_arch_hook(mm);
 
-	spin_lock(&mm->page_table_lock);
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
 		unsigned long idx;
 		pte_t *pte = huge_pte_alloc(mm, addr);
@@ -389,11 +392,12 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				goto out;
 			}
 		}
+		spin_lock(&mm->page_table_lock);
 		add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
 		set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+		spin_unlock(&mm->page_table_lock);
 	}
 out:
-	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index 4bdd1186b43b..a40e4b1cee4f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -282,14 +282,11 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
-	struct page *new;
-
-	spin_unlock(&mm->page_table_lock);
-	new = pte_alloc_one(mm, address);
-	spin_lock(&mm->page_table_lock);
+	struct page *new = pte_alloc_one(mm, address);
 	if (!new)
 		return -ENOMEM;
 
+	spin_lock(&mm->page_table_lock);
 	if (pmd_present(*pmd))		/* Another has populated it */
 		pte_free(new);
 	else {
@@ -297,6 +294,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 		inc_page_state(nr_page_table_pages);
 		pmd_populate(mm, pmd, new);
 	}
+	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
 
@@ -344,9 +342,6 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
  * covered by this vma.
- *
- * dst->page_table_lock is held on entry and exit,
- * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
  */
 
 static inline void
@@ -419,17 +414,19 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		unsigned long addr, unsigned long end)
 {
 	pte_t *src_pte, *dst_pte;
+	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
 	int rss[2];
 
 again:
 	rss[1] = rss[0] = 0;
-	dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
+	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
 	if (!dst_pte)
 		return -ENOMEM;
 	src_pte = pte_offset_map_nested(src_pmd, addr);
+	src_ptl = &src_mm->page_table_lock;
+	spin_lock(src_ptl);
 
-	spin_lock(&src_mm->page_table_lock);
 	do {
 		/*
 		 * We are holding two locks at this point - either of them
@@ -438,8 +435,8 @@ again:
 		if (progress >= 32) {
 			progress = 0;
 			if (need_resched() ||
-			    need_lockbreak(&src_mm->page_table_lock) ||
-			    need_lockbreak(&dst_mm->page_table_lock))
+			    need_lockbreak(src_ptl) ||
+			    need_lockbreak(dst_ptl))
 				break;
 		}
 		if (pte_none(*src_pte)) {
@@ -449,12 +446,12 @@ again:
 		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
-	spin_unlock(&src_mm->page_table_lock);
 
+	spin_unlock(src_ptl);
 	pte_unmap_nested(src_pte - 1);
-	pte_unmap(dst_pte - 1);
 	add_mm_rss(dst_mm, rss[0], rss[1]);
-	cond_resched_lock(&dst_mm->page_table_lock);
+	pte_unmap_unlock(dst_pte - 1, dst_ptl);
+	cond_resched();
 	if (addr != end)
 		goto again;
 	return 0;
@@ -1049,8 +1046,9 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			unsigned long addr, unsigned long end, pgprot_t prot)
 {
 	pte_t *pte;
+	spinlock_t *ptl;
 
-	pte = pte_alloc_map(mm, pmd, addr);
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
 	do {
@@ -1062,7 +1060,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, zero_pte);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(pte - 1);
+	pte_unmap_unlock(pte - 1, ptl);
 	return 0;
 }
 
@@ -1112,14 +1110,12 @@ int zeromap_page_range(struct vm_area_struct *vma,
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
-	spin_lock(&mm->page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		err = zeromap_pud_range(mm, pgd, addr, next, prot);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-	spin_unlock(&mm->page_table_lock);
 	return err;
 }
 
@@ -1133,8 +1129,9 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			unsigned long pfn, pgprot_t prot)
 {
 	pte_t *pte;
+	spinlock_t *ptl;
 
-	pte = pte_alloc_map(mm, pmd, addr);
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
 	do {
@@ -1142,7 +1139,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(pte - 1);
+	pte_unmap_unlock(pte - 1, ptl);
 	return 0;
 }
 
@@ -1210,7 +1207,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	pfn -= addr >> PAGE_SHIFT;
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
-	spin_lock(&mm->page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		err = remap_pud_range(mm, pgd, addr, next,
@@ -1218,7 +1214,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-	spin_unlock(&mm->page_table_lock);
 	return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
@@ -1985,17 +1980,9 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
  * with external mmu caches can use to update those (ie the Sparc or
  * PowerPC hashed page tables that act as extended TLBs).
  *
- * Note the "page_table_lock". It is to protect against kswapd removing
- * pages from under us. Note that kswapd only ever _removes_ pages, never
- * adds them. As such, once we have noticed that the page is not present,
- * we can drop the lock early.
- *
- * The adding of pages is protected by the MM semaphore (which we hold),
- * so we don't need to worry about a page being suddenly been added into
- * our VM.
- *
- * We enter with the pagetable spinlock held, we are supposed to
- * release it when done.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long address,
@@ -2003,6 +1990,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 {
 	pte_t entry;
 
+	spin_lock(&mm->page_table_lock);
 	entry = *pte;
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
@@ -2051,30 +2039,18 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, write_access);
 
-	/*
-	 * We need the page table lock to synchronize with kswapd
-	 * and the SMP-safe atomic PTE updates.
-	 */
 	pgd = pgd_offset(mm, address);
-	spin_lock(&mm->page_table_lock);
-
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
-		goto oom;
-
+		return VM_FAULT_OOM;
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
-		goto oom;
-
+		return VM_FAULT_OOM;
 	pte = pte_alloc_map(mm, pmd, address);
 	if (!pte)
-		goto oom;
-	
-	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+		return VM_FAULT_OOM;
 
- oom:
-	spin_unlock(&mm->page_table_lock);
-	return VM_FAULT_OOM;
+	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
 }
 
 #ifndef __PAGETABLE_PUD_FOLDED
@@ -2084,24 +2060,16 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
-	pud_t *new;
-
-	if (mm != &init_mm)		/* Temporary bridging hack */
-		spin_unlock(&mm->page_table_lock);
-	new = pud_alloc_one(mm, address);
-	if (!new) {
-		if (mm != &init_mm)	/* Temporary bridging hack */
-			spin_lock(&mm->page_table_lock);
+	pud_t *new = pud_alloc_one(mm, address);
+	if (!new)
 		return -ENOMEM;
-	}
 
 	spin_lock(&mm->page_table_lock);
 	if (pgd_present(*pgd))		/* Another has populated it */
 		pud_free(new);
 	else
 		pgd_populate(mm, pgd, new);
-	if (mm == &init_mm)		/* Temporary bridging hack */
-		spin_unlock(&mm->page_table_lock);
+	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
 #endif /* __PAGETABLE_PUD_FOLDED */
@@ -2113,16 +2081,9 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
  */
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
-	pmd_t *new;
-
-	if (mm != &init_mm)		/* Temporary bridging hack */
-		spin_unlock(&mm->page_table_lock);
-	new = pmd_alloc_one(mm, address);
-	if (!new) {
-		if (mm != &init_mm)	/* Temporary bridging hack */
-			spin_lock(&mm->page_table_lock);
+	pmd_t *new = pmd_alloc_one(mm, address);
+	if (!new)
 		return -ENOMEM;
-	}
 
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
@@ -2136,8 +2097,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	else
 		pgd_populate(mm, pud, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
-	if (mm == &init_mm)		/* Temporary bridging hack */
-		spin_unlock(&mm->page_table_lock);
+	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
diff --git a/mm/mremap.c b/mm/mremap.c
index 616facc3d28a..8de77b632a20 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -28,9 +28,6 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 	pud_t *pud;
 	pmd_t *pmd;
 
-	/*
-	 * We don't need page_table_lock: we have mmap_sem exclusively.
-	 */
 	pgd = pgd_offset(mm, addr);
 	if (pgd_none_or_clear_bad(pgd))
 		return NULL;
@@ -50,25 +47,20 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
-	pmd_t *pmd = NULL;
+	pmd_t *pmd;
 
-	/*
-	 * We do need page_table_lock: because allocators expect that.
-	 */
-	spin_lock(&mm->page_table_lock);
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
-		goto out;
+		return NULL;
 
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
-		goto out;
+		return NULL;
 
 	if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
-		pmd = NULL;
-out:
-	spin_unlock(&mm->page_table_lock);
+		return NULL;
+
 	return pmd;
 }
 
@@ -80,6 +72,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	struct address_space *mapping = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
+	spinlock_t *old_ptl;
 
 	if (vma->vm_file) {
 		/*
@@ -95,9 +88,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 			new_vma->vm_truncate_count = 0;
 	}
 
-	spin_lock(&mm->page_table_lock);
-	old_pte = pte_offset_map(old_pmd, old_addr);
-	new_pte = pte_offset_map_nested(new_pmd, new_addr);
+	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+ 	new_pte = pte_offset_map_nested(new_pmd, new_addr);
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 				   new_pte++, new_addr += PAGE_SIZE) {
@@ -110,8 +102,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	}
 
 	pte_unmap_nested(new_pte - 1);
-	pte_unmap(old_pte - 1);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
 }

From b462705ac679f6195d1b23a752cda592d9107495 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:24 -0700
Subject: [PATCH 50/91] [PATCH] mm: arches skip ptlock

Convert those few architectures which call pud_alloc, pmd_alloc or
pte_alloc_map on a user mm not to take the page_table_lock first, nor to
drop it after.  Each of these can continue to use pte_alloc_map: there is
no need to change over to pte_alloc_map_lock, since they're neither racy
nor swappable.
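
For illustration, a rough sketch of the pattern these arch paths now
follow (a made-up helper, not code from any of the architectures touched
here): descend and allocate under mmap_sem alone, and let the allocators
take page_table_lock internally when they populate an entry.

	static pte_t *alloc_user_pte(struct mm_struct *mm, unsigned long addr)
	{
		pgd_t *pgd = pgd_offset(mm, addr);
		pud_t *pud = pud_alloc(mm, pgd, addr);
		pmd_t *pmd;

		if (!pud)
			return NULL;
		pmd = pmd_alloc(mm, pud, addr);
		if (!pmd)
			return NULL;
		/* no spin_lock(&mm->page_table_lock) around any of this */
		return pte_alloc_map(mm, pmd, addr);
	}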

In the sparc64 io_remap_pfn_range, flush_tlb_range then falls outside of the
page_table_lock: that's okay, on sparc64 it's like flush_tlb_mm, and that has
always been called from outside of page_table_lock in dup_mmap.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/mm/mm-armv.c     | 14 --------------
 arch/arm26/mm/memc.c      | 15 ---------------
 arch/sparc/mm/generic.c   |  4 +---
 arch/sparc64/mm/generic.c |  6 ++----
 arch/um/kernel/skas/mmu.c |  3 ---
 5 files changed, 3 insertions(+), 39 deletions(-)

diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c
index 61bc2fa0511e..60f3e039bac2 100644
--- a/arch/arm/mm/mm-armv.c
+++ b/arch/arm/mm/mm-armv.c
@@ -179,11 +179,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t));
 
 	if (!vectors_high()) {
-		/*
-		 * This lock is here just to satisfy pmd_alloc and pte_lock
-		 */
-		spin_lock(&mm->page_table_lock);
-
 		/*
 		 * On ARM, first page must always be allocated since it
 		 * contains the machine vectors.
@@ -201,23 +196,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 		set_pte(new_pte, *init_pte);
 		pte_unmap_nested(init_pte);
 		pte_unmap(new_pte);
-
-		spin_unlock(&mm->page_table_lock);
 	}
 
 	return new_pgd;
 
 no_pte:
-	spin_unlock(&mm->page_table_lock);
 	pmd_free(new_pmd);
-	free_pages((unsigned long)new_pgd, 2);
-	return NULL;
-
 no_pmd:
-	spin_unlock(&mm->page_table_lock);
 	free_pages((unsigned long)new_pgd, 2);
-	return NULL;
-
 no_pgd:
 	return NULL;
 }
diff --git a/arch/arm26/mm/memc.c b/arch/arm26/mm/memc.c
index d6b008b8db76..34def6397c3c 100644
--- a/arch/arm26/mm/memc.c
+++ b/arch/arm26/mm/memc.c
@@ -78,12 +78,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	if (!new_pgd)
 		goto no_pgd;
 
-	/*
-	 * This lock is here just to satisfy pmd_alloc and pte_lock
-         * FIXME: I bet we could avoid taking it pretty much altogether
-	 */
-	spin_lock(&mm->page_table_lock);
-
 	/*
 	 * On ARM, first page must always be allocated since it contains
 	 * the machine vectors.
@@ -113,23 +107,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR,
 		(PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t));
 
-	spin_unlock(&mm->page_table_lock);
-
 	/* update MEMC tables */
 	cpu_memc_update_all(new_pgd);
 	return new_pgd;
 
 no_pte:
-	spin_unlock(&mm->page_table_lock);
 	pmd_free(new_pmd);
-	free_pgd_slow(new_pgd);
-	return NULL;
-
 no_pmd:
-	spin_unlock(&mm->page_table_lock);
 	free_pgd_slow(new_pgd);
-	return NULL;
-
 no_pgd:
 	return NULL;
 }
diff --git a/arch/sparc/mm/generic.c b/arch/sparc/mm/generic.c
index 659c9a71f867..9604893ffdbd 100644
--- a/arch/sparc/mm/generic.c
+++ b/arch/sparc/mm/generic.c
@@ -81,9 +81,8 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 	dir = pgd_offset(mm, from);
 	flush_cache_range(vma, beg, end);
 
-	spin_lock(&mm->page_table_lock);
 	while (from < end) {
-		pmd_t *pmd = pmd_alloc(current->mm, dir, from);
+		pmd_t *pmd = pmd_alloc(mm, dir, from);
 		error = -ENOMEM;
 		if (!pmd)
 			break;
@@ -93,7 +92,6 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 		from = (from + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	}
-	spin_unlock(&mm->page_table_lock);
 
 	flush_tlb_range(vma, beg, end);
 	return error;
diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c
index afc01cec701f..112c316e7cd2 100644
--- a/arch/sparc64/mm/generic.c
+++ b/arch/sparc64/mm/generic.c
@@ -135,9 +135,8 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 	dir = pgd_offset(mm, from);
 	flush_cache_range(vma, beg, end);
 
-	spin_lock(&mm->page_table_lock);
 	while (from < end) {
-		pud_t *pud = pud_alloc(current->mm, dir, from);
+		pud_t *pud = pud_alloc(mm, dir, from);
 		error = -ENOMEM;
 		if (!pud)
 			break;
@@ -147,8 +146,7 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 		from = (from + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	}
-	flush_tlb_range(vma, beg, end);
-	spin_unlock(&mm->page_table_lock);
 
+	flush_tlb_range(vma, beg, end);
 	return error;
 }
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 240143b616a2..02cf36e0331a 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -28,7 +28,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
 	pmd_t *pmd;
 	pte_t *pte;
 
-	spin_lock(&mm->page_table_lock);
 	pgd = pgd_offset(mm, proc);
 	pud = pud_alloc(mm, pgd, proc);
 	if (!pud)
@@ -63,7 +62,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
 	*pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT));
 	*pte = pte_mkexec(*pte);
 	*pte = pte_wrprotect(*pte);
-	spin_unlock(&mm->page_table_lock);
 	return(0);
 
  out_pmd:
@@ -71,7 +69,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
  out_pte:
 	pmd_free(pmd);
  out:
-	spin_unlock(&mm->page_table_lock);
 	return(-ENOMEM);
 }
 

From 8f4e2101fd7df9031a754eedb82e2060b51f8c45 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:26 -0700
Subject: [PATCH 51/91] [PATCH] mm: page fault handler locking

On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).

Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek.  Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).

But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts.  In those configs, "pte_unmap_same"
has to take page_table_lock, validate that orig_pte is still the same, and
drop page_table_lock again before unmapping and proceeding.

Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
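
A minimal sketch of the protocol the handlers now follow (a hypothetical
handler with the slow work elided; the real ones are in the diff below):
drop the pte map, do the slow work unlocked, then relock and recheck with
pte_same before committing anything.

	static int example_fault(struct mm_struct *mm, pmd_t *pmd,
				 unsigned long address, pte_t *page_table,
				 pte_t orig_pte)
	{
		spinlock_t *ptl;

		pte_unmap(page_table);
		/* slow work -- allocation, swap I/O -- without any pte lock */

		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
		if (unlikely(!pte_same(*page_table, orig_pte))) {
			/* someone else serviced the fault meanwhile: back out */
			pte_unmap_unlock(page_table, ptl);
			return VM_FAULT_MINOR;
		}
		/* ... install the new pte here, still under ptl ... */
		pte_unmap_unlock(page_table, ptl);
		return VM_FAULT_MINOR;
	}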

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 150 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 90 insertions(+), 60 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index a40e4b1cee4f..24ba688876d6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1218,6 +1218,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL(remap_pfn_range);
 
+/*
+ * handle_pte_fault chooses page fault handler according to an entry
+ * which was read non-atomically.  Before making any commitment, on
+ * those architectures or configurations (e.g. i386 with PAE) which
+ * might give a mix of unmatched parts, do_swap_page and do_file_page
+ * must check under lock before unmapping the pte and proceeding
+ * (but do_wp_page is only called after already making such a check;
+ * and do_anonymous_page and do_no_page can safely check later on).
+ */
+static inline int pte_unmap_same(struct mm_struct *mm,
+				pte_t *page_table, pte_t orig_pte)
+{
+	int same = 1;
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+	if (sizeof(pte_t) > sizeof(unsigned long)) {
+		spin_lock(&mm->page_table_lock);
+		same = pte_same(*page_table, orig_pte);
+		spin_unlock(&mm->page_table_lock);
+	}
+#endif
+	pte_unmap(page_table);
+	return same;
+}
+
 /*
  * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
  * servicing faults for write access.  In the normal case, do always want
@@ -1245,12 +1269,13 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
  * change only once the write actually happens. This avoids a few races,
  * and potentially makes it more efficient.
  *
- * We hold the mm semaphore and the page_table_lock on entry and exit
- * with the page_table_lock released.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		pte_t orig_pte)
+		spinlock_t *ptl, pte_t orig_pte)
 {
 	struct page *old_page, *new_page;
 	unsigned long pfn = pte_pfn(orig_pte);
@@ -1288,8 +1313,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Ok, we need to copy. Oh, well..
 	 */
 	page_cache_get(old_page);
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(page_table, ptl);
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
@@ -1307,8 +1331,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
-	spin_lock(&mm->page_table_lock);
-	page_table = pte_offset_map(pmd, address);
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (likely(pte_same(*page_table, orig_pte))) {
 		page_remove_rmap(old_page);
 		if (!PageAnon(old_page)) {
@@ -1321,7 +1344,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		ptep_establish(vma, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		lazy_mmu_prot_update(entry);
-
 		lru_cache_add_active(new_page);
 		page_add_anon_rmap(new_page, vma, address);
 
@@ -1332,8 +1354,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 unlock:
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(page_table, ptl);
 	return ret;
 oom:
 	page_cache_release(old_page);
@@ -1660,20 +1681,22 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc
 }
 
 /*
- * We hold the mm semaphore and the page_table_lock on entry and
- * should release the pagetable lock on exit..
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		int write_access, pte_t orig_pte)
 {
+	spinlock_t *ptl;
 	struct page *page;
 	swp_entry_t entry;
 	pte_t pte;
 	int ret = VM_FAULT_MINOR;
 
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	if (!pte_unmap_same(mm, page_table, orig_pte))
+		goto out;
 
 	entry = pte_to_swp_entry(orig_pte);
 	page = lookup_swap_cache(entry);
@@ -1682,11 +1705,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
  		page = read_swap_cache_async(entry, vma, address);
 		if (!page) {
 			/*
-			 * Back out if somebody else faulted in this pte while
-			 * we released the page table lock.
+			 * Back out if somebody else faulted in this pte
+			 * while we released the pte lock.
 			 */
-			spin_lock(&mm->page_table_lock);
-			page_table = pte_offset_map(pmd, address);
+			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 			if (likely(pte_same(*page_table, orig_pte)))
 				ret = VM_FAULT_OOM;
 			goto unlock;
@@ -1702,11 +1724,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	lock_page(page);
 
 	/*
-	 * Back out if somebody else faulted in this pte while we
-	 * released the page table lock.
+	 * Back out if somebody else already faulted in this pte.
 	 */
-	spin_lock(&mm->page_table_lock);
-	page_table = pte_offset_map(pmd, address);
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (unlikely(!pte_same(*page_table, orig_pte)))
 		goto out_nomap;
 
@@ -1735,7 +1755,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (write_access) {
 		if (do_wp_page(mm, vma, address,
-				page_table, pmd, pte) == VM_FAULT_OOM)
+				page_table, pmd, ptl, pte) == VM_FAULT_OOM)
 			ret = VM_FAULT_OOM;
 		goto out;
 	}
@@ -1744,37 +1764,32 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	update_mmu_cache(vma, address, pte);
 	lazy_mmu_prot_update(pte);
 unlock:
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(page_table, ptl);
 out:
 	return ret;
 out_nomap:
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(page_table, ptl);
 	unlock_page(page);
 	page_cache_release(page);
 	return ret;
 }
 
 /*
- * We are called with the MM semaphore and page_table_lock
- * spinlock held to protect against concurrent faults in
- * multithreaded programs. 
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		int write_access)
 {
-	struct page *page = ZERO_PAGE(addr);
+	struct page *page;
+	spinlock_t *ptl;
 	pte_t entry;
 
-	/* Mapping of ZERO_PAGE - vm_page_prot is readonly */
-	entry = mk_pte(page, vma->vm_page_prot);
-
 	if (write_access) {
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
-		spin_unlock(&mm->page_table_lock);
 
 		if (unlikely(anon_vma_prepare(vma)))
 			goto oom;
@@ -1782,23 +1797,28 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (!page)
 			goto oom;
 
-		spin_lock(&mm->page_table_lock);
-		page_table = pte_offset_map(pmd, address);
-
-		if (!pte_none(*page_table)) {
-			page_cache_release(page);
-			goto unlock;
-		}
-		inc_mm_counter(mm, anon_rss);
 		entry = mk_pte(page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+
+		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+		if (!pte_none(*page_table))
+			goto release;
+		inc_mm_counter(mm, anon_rss);
 		lru_cache_add_active(page);
 		SetPageReferenced(page);
 		page_add_anon_rmap(page, vma, address);
 	} else {
+		/* Map the ZERO_PAGE - vm_page_prot is readonly */
+		page = ZERO_PAGE(address);
+		page_cache_get(page);
+		entry = mk_pte(page, vma->vm_page_prot);
+
+		ptl = &mm->page_table_lock;
+		spin_lock(ptl);
+		if (!pte_none(*page_table))
+			goto release;
 		inc_mm_counter(mm, file_rss);
 		page_add_file_rmap(page);
-		page_cache_get(page);
 	}
 
 	set_pte_at(mm, address, page_table, entry);
@@ -1807,9 +1827,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	update_mmu_cache(vma, address, entry);
 	lazy_mmu_prot_update(entry);
 unlock:
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(page_table, ptl);
 	return VM_FAULT_MINOR;
+release:
+	page_cache_release(page);
+	goto unlock;
 oom:
 	return VM_FAULT_OOM;
 }
@@ -1823,13 +1845,15 @@ oom:
  * As this is called only for pages that do not currently exist, we
  * do not need to flush old virtual caches or the TLB.
  *
- * This is called with the MM semaphore held and the page table
- * spinlock held. Exit with the spinlock released.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		int write_access)
 {
+	spinlock_t *ptl;
 	struct page *new_page;
 	struct address_space *mapping = NULL;
 	pte_t entry;
@@ -1838,7 +1862,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int anon = 0;
 
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
 
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
@@ -1878,21 +1901,20 @@ retry:
 		anon = 1;
 	}
 
-	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	/*
 	 * For a file-backed vma, someone could have truncated or otherwise
 	 * invalidated this page.  If unmap_mapping_range got called,
 	 * retry getting the page.
 	 */
 	if (mapping && unlikely(sequence != mapping->truncate_count)) {
-		spin_unlock(&mm->page_table_lock);
+		pte_unmap_unlock(page_table, ptl);
 		page_cache_release(new_page);
 		cond_resched();
 		sequence = mapping->truncate_count;
 		smp_rmb();
 		goto retry;
 	}
-	page_table = pte_offset_map(pmd, address);
 
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
@@ -1929,8 +1951,7 @@ retry:
 	update_mmu_cache(vma, address, entry);
 	lazy_mmu_prot_update(entry);
 unlock:
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(page_table, ptl);
 	return ret;
 oom:
 	page_cache_release(new_page);
@@ -1941,6 +1962,10 @@ oom:
  * Fault of a previously existing named mapping. Repopulate the pte
  * from the encoded file_pte if possible. This enables swappable
  * nonlinear vmas.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
@@ -1949,8 +1974,8 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgoff_t pgoff;
 	int err;
 
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	if (!pte_unmap_same(mm, page_table, orig_pte))
+		return VM_FAULT_MINOR;
 
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
 		/*
@@ -1989,8 +2014,8 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 		pte_t *pte, pmd_t *pmd, int write_access)
 {
 	pte_t entry;
+	spinlock_t *ptl;
 
-	spin_lock(&mm->page_table_lock);
 	entry = *pte;
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
@@ -2007,17 +2032,22 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 					pte, pmd, write_access, entry);
 	}
 
+	ptl = &mm->page_table_lock;
+	spin_lock(ptl);
+	if (unlikely(!pte_same(*pte, entry)))
+		goto unlock;
 	if (write_access) {
 		if (!pte_write(entry))
-			return do_wp_page(mm, vma, address, pte, pmd, entry);
+			return do_wp_page(mm, vma, address,
+					pte, pmd, ptl, entry);
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
 	ptep_set_access_flags(vma, address, pte, entry, write_access);
 	update_mmu_cache(vma, address, entry);
 	lazy_mmu_prot_update(entry);
-	pte_unmap(pte);
-	spin_unlock(&mm->page_table_lock);
+unlock:
+	pte_unmap_unlock(pte, ptl);
 	return VM_FAULT_MINOR;
 }
 

From 705e87c0c3c38424f7f30556c85bc20e808d2f59 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:27 -0700
Subject: [PATCH 52/91] [PATCH] mm: pte_offset_map_lock loops

Convert those common loops using page_table_lock on the outside and
pte_offset_map within to use just pte_offset_map_lock within instead.
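
The converted loops all share the shape sketched below (a generic example,
not one of the functions changed here): map and lock the pte page in one
step, walk it, then unmap and unlock in one step.

	static void example_pte_range(struct mm_struct *mm, pmd_t *pmd,
				      unsigned long addr, unsigned long end)
	{
		spinlock_t *ptl;
		pte_t *pte;

		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
		do {
			/* ... examine or update *pte ... */
		} while (pte++, addr += PAGE_SIZE, addr != end);
		pte_unmap_unlock(pte - 1, ptl);
		cond_resched();
	}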

These all hold mmap_sem (some exclusively, some not), so at no level can a
page table be whipped away from beneath them.  But whereas pte_alloc loops
tested with the "atomic" pmd_present, these loops are testing with pmd_none,
which on i386 PAE tests both lower and upper halves.

That's now unsafe, so add a cast into pmd_none to test only the vital lower
half: we lose a little sensitivity to a corrupt middle directory, but not
enough to worry about.  It appears that i386 and UML were the only
architectures vulnerable in this way, and that pgd and pud pose no problem.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/task_mmu.c         | 17 ++++++-----------
 include/asm-i386/pgtable.h |  3 ++-
 include/asm-um/pgtable.h   |  2 +-
 mm/mempolicy.c             |  7 +++----
 mm/mprotect.c              |  7 +++----
 mm/msync.c                 | 21 ++++++---------------
 mm/swapfile.c              | 20 +++++++++-----------
 7 files changed, 30 insertions(+), 47 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7c89b4549049..7e5e7ec2e36d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -203,13 +203,14 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				struct mem_size_stats *mss)
 {
 	pte_t *pte, ptent;
+	spinlock_t *ptl;
 	unsigned long pfn;
 	struct page *page;
 
-	pte = pte_offset_map(pmd, addr);
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		ptent = *pte;
-		if (pte_none(ptent) || !pte_present(ptent))
+		if (!pte_present(ptent))
 			continue;
 
 		mss->resident += PAGE_SIZE;
@@ -230,8 +231,8 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				mss->private_clean += PAGE_SIZE;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(pte - 1);
-	cond_resched_lock(&vma->vm_mm->page_table_lock);
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
 }
 
 static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -285,17 +286,11 @@ static inline void smaps_pgd_range(struct vm_area_struct *vma,
 static int show_smap(struct seq_file *m, void *v)
 {
 	struct vm_area_struct *vma = v;
-	struct mm_struct *mm = vma->vm_mm;
 	struct mem_size_stats mss;
 
 	memset(&mss, 0, sizeof mss);
-
-	if (mm) {
-		spin_lock(&mm->page_table_lock);
+	if (vma->vm_mm)
 		smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
-		spin_unlock(&mm->page_table_lock);
-	}
-
 	return show_map_internal(m, v, &mss);
 }
 
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index d101ac414f07..0e3ec809352d 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -203,7 +203,8 @@ extern unsigned long pg0[];
 #define pte_present(x)	((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
 #define pte_clear(mm,addr,xp)	do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 
-#define pmd_none(x)	(!pmd_val(x))
+/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
+#define pmd_none(x)	(!(unsigned long)pmd_val(x))
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
 #define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h
index 616d02b57ea9..ac64eb955868 100644
--- a/include/asm-um/pgtable.h
+++ b/include/asm-um/pgtable.h
@@ -138,7 +138,7 @@ extern unsigned long pg0[1024];
 
 #define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE))
 
-#define pmd_none(x)	(!(pmd_val(x) & ~_PAGE_NEWPAGE))
+#define pmd_none(x)	(!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE))
 #define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
 #define pmd_clear(xp)	do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 11d824f282f1..902d4c9eccdc 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -228,9 +228,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 {
 	pte_t *orig_pte;
 	pte_t *pte;
+	spinlock_t *ptl;
 
-	spin_lock(&vma->vm_mm->page_table_lock);
-	orig_pte = pte = pte_offset_map(pmd, addr);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		unsigned long pfn;
 		unsigned int nid;
@@ -246,8 +246,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!node_isset(nid, *nodes))
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(orig_pte);
-	spin_unlock(&vma->vm_mm->page_table_lock);
+	pte_unmap_unlock(orig_pte, ptl);
 	return addr != end;
 }
 
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 672a76fddd5e..17a2b52b753b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,8 +29,9 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		unsigned long addr, unsigned long end, pgprot_t newprot)
 {
 	pte_t *pte;
+	spinlock_t *ptl;
 
-	pte = pte_offset_map(pmd, addr);
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	do {
 		if (pte_present(*pte)) {
 			pte_t ptent;
@@ -44,7 +45,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			lazy_mmu_prot_update(ptent);
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(pte - 1);
+	pte_unmap_unlock(pte - 1, ptl);
 }
 
 static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -88,7 +89,6 @@ static void change_protection(struct vm_area_struct *vma,
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
-	spin_lock(&mm->page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
@@ -96,7 +96,6 @@ static void change_protection(struct vm_area_struct *vma,
 		change_pud_range(mm, pgd, addr, next, newprot);
 	} while (pgd++, addr = next, addr != end);
 	flush_tlb_range(vma, start, end);
-	spin_unlock(&mm->page_table_lock);
 }
 
 static int
diff --git a/mm/msync.c b/mm/msync.c
index 860395486060..0e040e9c39d8 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -17,28 +17,22 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 
-/*
- * Called with mm->page_table_lock held to protect against other
- * threads/the swapper from ripping pte's out from under us.
- */
-
 static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end)
 {
-	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte;
+	spinlock_t *ptl;
 	int progress = 0;
 
 again:
-	pte = pte_offset_map(pmd, addr);
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		unsigned long pfn;
 		struct page *page;
 
 		if (progress >= 64) {
 			progress = 0;
-			if (need_resched() ||
-			    need_lockbreak(&mm->page_table_lock))
+			if (need_resched() || need_lockbreak(ptl))
 				break;
 		}
 		progress++;
@@ -58,8 +52,8 @@ again:
 			set_page_dirty(page);
 		progress += 3;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(pte - 1);
-	cond_resched_lock(&mm->page_table_lock);
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
 	if (addr != end)
 		goto again;
 }
@@ -97,7 +91,6 @@ static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 static void msync_page_range(struct vm_area_struct *vma,
 				unsigned long addr, unsigned long end)
 {
-	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
 	unsigned long next;
 
@@ -110,16 +103,14 @@ static void msync_page_range(struct vm_area_struct *vma,
 		return;
 
 	BUG_ON(addr >= end);
-	pgd = pgd_offset(mm, addr);
+	pgd = pgd_offset(vma->vm_mm, addr);
 	flush_cache_range(vma, addr, end);
-	spin_lock(&mm->page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
 		msync_pud_range(vma, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
-	spin_unlock(&mm->page_table_lock);
 }
 
 /*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 296e0bbf7836..510f0039b000 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -401,8 +401,6 @@ void free_swap_and_cache(swp_entry_t entry)
  * No need to decide whether this PTE shares the swap entry with others,
  * just let do_wp_page work it out if a write is requested later - to
  * force COW, vm_page_prot omits write permission from any private vma.
- *
- * vma->vm_mm->page_table_lock is held.
  */
 static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 		unsigned long addr, swp_entry_t entry, struct page *page)
@@ -424,23 +422,25 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				swp_entry_t entry, struct page *page)
 {
-	pte_t *pte;
 	pte_t swp_pte = swp_entry_to_pte(entry);
+	pte_t *pte;
+	spinlock_t *ptl;
+	int found = 0;
 
-	pte = pte_offset_map(pmd, addr);
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		/*
 		 * swapoff spends a _lot_ of time in this loop!
 		 * Test inline before going to call unuse_pte.
 		 */
 		if (unlikely(pte_same(*pte, swp_pte))) {
-			unuse_pte(vma, pte, addr, entry, page);
-			pte_unmap(pte);
-			return 1;
+			unuse_pte(vma, pte++, addr, entry, page);
+			found = 1;
+			break;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(pte - 1);
-	return 0;
+	pte_unmap_unlock(pte - 1, ptl);
+	return found;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -522,12 +522,10 @@ static int unuse_mm(struct mm_struct *mm,
 		down_read(&mm->mmap_sem);
 		lock_page(page);
 	}
-	spin_lock(&mm->page_table_lock);
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		if (vma->anon_vma && unuse_vma(vma, entry, page))
 			break;
 	}
-	spin_unlock(&mm->page_table_lock);
 	up_read(&mm->mmap_sem);
 	/*
 	 * Currently unuse_mm cannot fail, but leave error handling

From 663b97f7efd001b0c56bd5fce059c5272725b86f Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:28 -0700
Subject: [PATCH 53/91] [PATCH] mm: flush_tlb_range outside ptlock

There was one small but very significant change in the previous patch:
mprotect's flush_tlb_range fell outside the page_table_lock.  That is how
it is in 2.4, but that doesn't prove it safe in 2.6.

On some architectures flush_tlb_range comes to the same as flush_tlb_mm, which
has always been called from outside page_table_lock in dup_mmap, and is so
proved safe.  Others required a deeper audit: I could find no reliance on
page_table_lock in any; but in ia64 and parisc I found some code which looks a
bit as if it might want preemption disabled.  That won't do any actual harm,
so pending a decision from the maintainers, disable preemption there.

Remove comments on page_table_lock from flush_tlb_mm, flush_tlb_range and
flush_tlb_page entries in cachetlb.txt: they were rather misleading (what
generic code does is different from what usually happens), the rules are now
changing, and it's not yet clear where we'll end up (will the generic
tlb_flush_mmu happen always under lock?  never under lock?  or sometimes under
and sometimes not?).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cachetlb.txt    | 9 ---------
 arch/ia64/mm/tlb.c            | 2 ++
 include/asm-parisc/tlbflush.h | 3 ++-
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt
index e132fb1163b0..7eb715e07eda 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -49,9 +49,6 @@ changes occur:
 	page table operations such as what happens during
 	fork, and exec.
 
-	Platform developers note that generic code will always
-	invoke this interface without mm->page_table_lock held.
-
 3) void flush_tlb_range(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 
@@ -72,9 +69,6 @@ changes occur:
 	call flush_tlb_page (see below) for each entry which may be
 	modified.
 
-	Platform developers note that generic code will always
-	invoke this interface with mm->page_table_lock held.
-
 4) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
 
 	This time we need to remove the PAGE_SIZE sized translation
@@ -93,9 +87,6 @@ changes occur:
 
 	This is used primarily during fault processing.
 
-	Platform developers note that generic code will always
-	invoke this interface with mm->page_table_lock held.
-
 5) void flush_tlb_pgtables(struct mm_struct *mm,
 			   unsigned long start, unsigned long end)
 
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
index c93e0f2b5fea..c79a9b96d02b 100644
--- a/arch/ia64/mm/tlb.c
+++ b/arch/ia64/mm/tlb.c
@@ -158,10 +158,12 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long
 # ifdef CONFIG_SMP
 	platform_global_tlb_purge(mm, start, end, nbits);
 # else
+	preempt_disable();
 	do {
 		ia64_ptcl(start, (nbits<<2));
 		start += (1UL << nbits);
 	} while (start < end);
+	preempt_enable();
 # endif
 
 	ia64_srlz_i();			/* srlz.i implies srlz.d */
diff --git a/include/asm-parisc/tlbflush.h b/include/asm-parisc/tlbflush.h
index 84af4ab1fe51..e97aa8d1eff5 100644
--- a/include/asm-parisc/tlbflush.h
+++ b/include/asm-parisc/tlbflush.h
@@ -88,7 +88,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 	if (npages >= 512)  /* 2MB of space: arbitrary, should be tuned */
 		flush_tlb_all();
 	else {
-
+		preempt_disable();
 		mtsp(vma->vm_mm->context,1);
 		purge_tlb_start();
 		if (split_tlb) {
@@ -102,6 +102,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 				pdtlb(start);
 				start += PAGE_SIZE;
 			}
+		preempt_enable();
 		}
 		purge_tlb_end();
 	}

From 8f4f8c164cb4af1432cc25eda82928ea4519ba72 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:29 -0700
Subject: [PATCH 54/91] [PATCH] mm: unlink vma before pagetables

In most places the descent from pgd to pud to pmd to pte holds mmap_sem
(exclusively or not), which ensures that free_pgtables cannot be freeing page
tables from any level at the same time.  But truncation and reverse mapping
descend without mmap_sem.

No problem: just make sure that a vma is unlinked from its prio_tree (or
nonlinear list) and from its anon_vma list, after zapping the vma, but before
freeing its page tables.  Then neither vmtruncate nor rmap can reach that vma
whose page tables are now volatile (nor do they need to reach it, since all
its page entries have been zapped by this stage).

The i_mmap_lock and anon_vma->lock already serialize this correctly; but the
locking hierarchy is such that we cannot take them while holding
page_table_lock.  Well, we're trying to push that down anyway.  So in this
patch, move anon_vma_unlink and unlink_file_vma into free_pgtables, at the
same time as moving page_table_lock around calls to unmap_vmas.

tlb_gather_mmu and tlb_finish_mmu then fall outside the page_table_lock, but
we made them preempt_disable and preempt_enable earlier; and a long source
audit of all the architectures has shown no problem with removing
page_table_lock from them.  free_pgtables doesn't need page_table_lock for
itself, nor for what it calls; tlb->mm->nr_ptes is usually protected by
page_table_lock, but partly by non-exclusive mmap_sem - here it's decremented
with exclusive mmap_sem, or mm_users 0.  update_hiwater_rss and
vm_unacct_memory don't need page_table_lock either.
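
In outline, the unmap path now orders its work as below (a simplified
sketch of unmap_region from the diff; "floor" and "ceiling" stand for the
prev/next vma bounds passed in the real code).

	static void example_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long start, unsigned long end,
			unsigned long floor, unsigned long ceiling)
	{
		struct mmu_gather *tlb;
		unsigned long nr_accounted = 0;

		tlb = tlb_gather_mmu(mm, 0);	/* now outside page_table_lock */
		update_hiwater_rss(mm);
		spin_lock(&mm->page_table_lock);
		unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
		spin_unlock(&mm->page_table_lock);
		vm_unacct_memory(nr_accounted);
		/* free_pgtables unlinks each vma from rmap before freeing */
		free_pgtables(&tlb, vma, floor, ceiling);
		tlb_finish_mmu(tlb, start, end);
	}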

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 12 ++++++++++--
 mm/mmap.c   | 23 ++++++-----------------
 2 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 24ba688876d6..4ea89a2e3a83 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -260,6 +260,12 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 		struct vm_area_struct *next = vma->vm_next;
 		unsigned long addr = vma->vm_start;
 
+		/*
+		 * Hide vma from rmap and vmtruncate before freeing pgtables
+		 */
+		anon_vma_unlink(vma);
+		unlink_file_vma(vma);
+
 		if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
 			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next? next->vm_start: ceiling);
@@ -272,6 +278,8 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 							HPAGE_SIZE)) {
 				vma = next;
 				next = vma->vm_next;
+				anon_vma_unlink(vma);
+				unlink_file_vma(vma);
 			}
 			free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next? next->vm_start: ceiling);
@@ -798,12 +806,12 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 	}
 
 	lru_add_drain();
-	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
+	spin_lock(&mm->page_table_lock);
 	end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
-	tlb_finish_mmu(tlb, address, end);
 	spin_unlock(&mm->page_table_lock);
+	tlb_finish_mmu(tlb, address, end);
 	return end;
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index d931d7e49ac9..fa35323a3c5b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -203,14 +203,6 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 {
 	struct vm_area_struct *next = vma->vm_next;
 
-	/*
-	 * Hide vma from rmap and vmtruncate before freeing page tables:
-	 * to be moved into free_pgtables once page_table_lock is lifted
-	 * from it, but until then lock ordering forbids that move.
-	 */
-	anon_vma_unlink(vma);
-	unlink_file_vma(vma);
-
 	might_sleep();
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
@@ -1679,15 +1671,15 @@ static void unmap_region(struct mm_struct *mm,
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
-	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
+	spin_lock(&mm->page_table_lock);
 	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
+	spin_unlock(&mm->page_table_lock);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
 	tlb_finish_mmu(tlb, start, end);
-	spin_unlock(&mm->page_table_lock);
 }
 
 /*
@@ -1962,23 +1954,20 @@ void exit_mmap(struct mm_struct *mm)
 	unsigned long end;
 
 	lru_add_drain();
-
-	spin_lock(&mm->page_table_lock);
-
 	flush_cache_mm(mm);
 	tlb = tlb_gather_mmu(mm, 1);
 	/* Don't update_hiwater_rss(mm) here, do_exit already did */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
+	spin_lock(&mm->page_table_lock);
 	end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
+	spin_unlock(&mm->page_table_lock);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
 
-	spin_unlock(&mm->page_table_lock);
-
 	/*
-	 * Walk the list again, actually closing and freeing it
-	 * without holding any MM locks.
+	 * Walk the list again, actually closing and freeing it,
+	 * with preemption enabled, without holding any MM locks.
 	 */
 	while (vma)
 		vma = remove_vma(vma);

From 508034a32b819a2d40aa7ac0dbc8cd2e044c2de6 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:30 -0700
Subject: [PATCH 55/91] [PATCH] mm: unmap_vmas with inner ptlock

Remove the page_table_lock from around the calls to unmap_vmas, and replace
the pte_offset_map in zap_pte_range by pte_offset_map_lock: all callers are
now safe to descend without page_table_lock.

Don't attempt fancy locking for hugepages, just take page_table_lock in
unmap_hugepage_range.  Which makes zap_hugepage_range, and the hugetlb test in
zap_page_range, redundant: unmap_vmas calls unmap_hugepage_range anyway.  Nor
does unmap_vmas have much use for its mm arg now.
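
So unmap_hugepage_range now does its own locking; in outline (simplified
from the hugetlb.c hunk below, with the pte-clearing body elided):

	void unmap_hugepage_range(struct vm_area_struct *vma,
				  unsigned long start, unsigned long end)
	{
		struct mm_struct *mm = vma->vm_mm;

		spin_lock(&mm->page_table_lock);
		update_hiwater_rss(mm);
		/* ... clear each huge pte and release its page ... */
		spin_unlock(&mm->page_table_lock);
		flush_tlb_range(vma, start, end);
	}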

The tlb_start_vma and tlb_end_vma in unmap_page_range are now called without
page_table_lock: if they're implemented at all, they typically come down to
flush_cache_range (usually done outside page_table_lock) and flush_tlb_range
(which we already audited for the mprotect case).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c    | 10 +++-------
 include/linux/hugetlb.h |  2 --
 include/linux/mm.h      |  2 +-
 mm/hugetlb.c            | 12 +++---------
 mm/memory.c             | 41 ++++++++++++-----------------------------
 mm/mmap.c               |  8 ++------
 6 files changed, 21 insertions(+), 54 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3a9b6d179cbd..a826a8add5e3 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -92,7 +92,7 @@ out:
 }
 
 /*
- * Called under down_write(mmap_sem), page_table_lock is not held
+ * Called under down_write(mmap_sem).
  */
 
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
@@ -308,7 +308,6 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
 
 	vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) {
 		unsigned long h_vm_pgoff;
-		unsigned long v_length;
 		unsigned long v_offset;
 
 		h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
@@ -319,11 +318,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
 		if (h_vm_pgoff >= h_pgoff)
 			v_offset = 0;
 
-		v_length = vma->vm_end - vma->vm_start;
-
-		zap_hugepage_range(vma,
-				vma->vm_start + v_offset,
-				v_length - v_offset);
+		unmap_hugepage_range(vma,
+				vma->vm_start + v_offset, vma->vm_end);
 	}
 }
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d664330d900e..0cea162b08c0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -16,7 +16,6 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
-void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
 void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
@@ -87,7 +86,6 @@ static inline unsigned long hugetlb_total_pages(void)
 #define follow_huge_addr(mm, addr, write)	ERR_PTR(-EINVAL)
 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
 #define hugetlb_prefault(mapping, vma)		({ BUG(); 0; })
-#define zap_hugepage_range(vma, start, len)	BUG()
 #define unmap_hugepage_range(vma, start, end)	BUG()
 #define is_hugepage_mem_enough(size)		0
 #define hugetlb_report_meminfo(buf)		0
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d4c3512e7db4..972e2ce8e07c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -682,7 +682,7 @@ struct zap_details {
 
 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm,
+unsigned long unmap_vmas(struct mmu_gather **tlb,
 		struct vm_area_struct *start_vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ea0826ff2663..f29b7dc02c39 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -314,6 +314,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
+	spin_lock(&mm->page_table_lock);
+
 	/* Update high watermark before we lower rss */
 	update_hiwater_rss(mm);
 
@@ -333,17 +335,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		put_page(page);
 		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
 	}
-	flush_tlb_range(vma, start, end);
-}
 
-void zap_hugepage_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
 	spin_unlock(&mm->page_table_lock);
+	flush_tlb_range(vma, start, end);
 }
 
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
diff --git a/mm/memory.c b/mm/memory.c
index 4ea89a2e3a83..622a4ef5409f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -551,10 +551,11 @@ static void zap_pte_range(struct mmu_gather *tlb,
 {
 	struct mm_struct *mm = tlb->mm;
 	pte_t *pte;
+	spinlock_t *ptl;
 	int file_rss = 0;
 	int anon_rss = 0;
 
-	pte = pte_offset_map(pmd, addr);
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	do {
 		pte_t ptent = *pte;
 		if (pte_none(ptent))
@@ -621,7 +622,7 @@ static void zap_pte_range(struct mmu_gather *tlb,
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
 	add_mm_rss(mm, file_rss, anon_rss);
-	pte_unmap(pte - 1);
+	pte_unmap_unlock(pte - 1, ptl);
 }
 
 static inline void zap_pmd_range(struct mmu_gather *tlb,
@@ -690,7 +691,6 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
  * @tlbp: address of the caller's struct mmu_gather
- * @mm: the controlling mm_struct
  * @vma: the starting vma
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
@@ -699,10 +699,10 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
  *
  * Returns the end address of the unmapping (restart addr if interrupted).
  *
- * Unmap all pages in the vma list.  Called under page_table_lock.
+ * Unmap all pages in the vma list.
  *
- * We aim to not hold page_table_lock for too long (for scheduling latency
- * reasons).  So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
+ * We aim to not hold locks for too long (for scheduling latency reasons).
+ * So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
  * return the ending mmu_gather to the caller.
  *
  * Only addresses between `start' and `end' will be unmapped.
@@ -714,7 +714,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
  * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
  * drops the lock and schedules.
  */
-unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
+unsigned long unmap_vmas(struct mmu_gather **tlbp,
 		struct vm_area_struct *vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *details)
@@ -764,19 +764,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
 			tlb_finish_mmu(*tlbp, tlb_start, start);
 
 			if (need_resched() ||
-				need_lockbreak(&mm->page_table_lock) ||
 				(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
 				if (i_mmap_lock) {
-					/* must reset count of rss freed */
-					*tlbp = tlb_gather_mmu(mm, fullmm);
+					*tlbp = NULL;
 					goto out;
 				}
-				spin_unlock(&mm->page_table_lock);
 				cond_resched();
-				spin_lock(&mm->page_table_lock);
 			}
 
-			*tlbp = tlb_gather_mmu(mm, fullmm);
+			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
 			tlb_start_valid = 0;
 			zap_bytes = ZAP_BLOCK_SIZE;
 		}
@@ -800,18 +796,12 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 	unsigned long end = address + size;
 	unsigned long nr_accounted = 0;
 
-	if (is_vm_hugetlb_page(vma)) {
-		zap_hugepage_range(vma, address, size);
-		return end;
-	}
-
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
-	spin_lock(&mm->page_table_lock);
-	end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
-	spin_unlock(&mm->page_table_lock);
-	tlb_finish_mmu(tlb, address, end);
+	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+	if (tlb)
+		tlb_finish_mmu(tlb, address, end);
 	return end;
 }
 
@@ -1434,13 +1424,6 @@ again:
 
 	restart_addr = zap_page_range(vma, start_addr,
 					end_addr - start_addr, details);
-
-	/*
-	 * We cannot rely on the break test in unmap_vmas:
-	 * on the one hand, we don't want to restart our loop
-	 * just because that broke out for the page_table_lock;
-	 * on the other hand, it does no test when vma is small.
-	 */
 	need_break = need_resched() ||
 			need_lockbreak(details->i_mmap_lock);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index fa35323a3c5b..5ecc2cf3e1d7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1673,9 +1673,7 @@ static void unmap_region(struct mm_struct *mm,
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
-	spin_lock(&mm->page_table_lock);
-	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
-	spin_unlock(&mm->page_table_lock);
+	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
@@ -1958,9 +1956,7 @@ void exit_mmap(struct mm_struct *mm)
 	tlb = tlb_gather_mmu(mm, 1);
 	/* Don't update_hiwater_rss(mm) here, do_exit already did */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
-	spin_lock(&mm->page_table_lock);
-	end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
-	spin_unlock(&mm->page_table_lock);
+	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);

From 67b02f119df50ffad5a4e9e53ea4c896535862cd Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:31 -0700
Subject: [PATCH 56/91] [PATCH] mm: xip_unmap ZERO_PAGE fix

Small fix to the PageReserved patch: the mips ZERO_PAGE(address) depends on
address, so __xip_unmap is wrong to initialize page with that before address
is initialized; and in fact must re-evaluate it each iteration.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/filemap_xip.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 9354ee279b13..4e74ad60339a 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -174,7 +174,7 @@ __xip_unmap (struct address_space * mapping,
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
-	struct page *page = ZERO_PAGE(address);
+	struct page *page;
 
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -182,6 +182,7 @@ __xip_unmap (struct address_space * mapping,
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+		page = ZERO_PAGE(address);
 		/*
 		 * We need the page_table_lock to protect us from page faults,
 		 * munmap, fork, etc...

From c0718806cf955d5eb51ea77bffb5b21d9bba4972 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:31 -0700
Subject: [PATCH 57/91] [PATCH] mm: rmap with inner ptlock

rmap's page_check_address now descends without page_table_lock: first just
pte_offset_map, in case there's no pte present worth locking for; then take
page_table_lock for the full check, and pass ptl back to the caller in the
same style as pte_offset_map_lock.  __xip_unmap, page_referenced_one and
try_to_unmap_one use pte_unmap_unlock; so does try_to_unmap_cluster.

page_check_address is reformatted to avoid progressive indentation.  No use
is made of its one error code: it now just returns NULL when it fails.
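
Callers now follow the same convention as pte_offset_map_lock; a sketch of
a hypothetical caller (compare the __xip_unmap and rmap hunks below):

	static void example_caller(struct page *page, struct mm_struct *mm,
				   unsigned long address)
	{
		spinlock_t *ptl;
		pte_t *pte;

		pte = page_check_address(page, mm, address, &ptl);
		if (pte) {
			/* page is mapped at address: work on *pte under ptl */
			pte_unmap_unlock(pte, ptl);
		}
	}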

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/rmap.h |   4 +-
 mm/filemap_xip.c     |  12 ++---
 mm/rmap.c            | 109 +++++++++++++++++++++----------------------
 3 files changed, 60 insertions(+), 65 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index e80fb7ee6efd..35b30e6c8cf8 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -95,8 +95,8 @@ int try_to_unmap(struct page *);
 /*
  * Called from mm/filemap_xip.c to unmap empty zero page
  */
-pte_t *page_check_address(struct page *, struct mm_struct *, unsigned long);
-
+pte_t *page_check_address(struct page *, struct mm_struct *,
+				unsigned long, spinlock_t **);
 
 /*
  * Used by swapoff to help locate where page is expected in vma.
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 4e74ad60339a..9cf687e4a29a 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -174,6 +174,7 @@ __xip_unmap (struct address_space * mapping,
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
+	spinlock_t *ptl;
 	struct page *page;
 
 	spin_lock(&mapping->i_mmap_lock);
@@ -183,20 +184,15 @@ __xip_unmap (struct address_space * mapping,
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 		page = ZERO_PAGE(address);
-		/*
-		 * We need the page_table_lock to protect us from page faults,
-		 * munmap, fork, etc...
-		 */
-		pte = page_check_address(page, mm, address);
-		if (!IS_ERR(pte)) {
+		pte = page_check_address(page, mm, address, &ptl);
+		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush(vma, address, pte);
 			page_remove_rmap(page);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
-			pte_unmap(pte);
-			spin_unlock(&mm->page_table_lock);
+			pte_unmap_unlock(pte, ptl);
 			page_cache_release(page);
 		}
 	}
diff --git a/mm/rmap.c b/mm/rmap.c
index 4c52c56c9905..a84bdfe582c0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -247,34 +247,41 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
  * On success returns with mapped pte and locked mm->page_table_lock.
  */
 pte_t *page_check_address(struct page *page, struct mm_struct *mm,
-			  unsigned long address)
+			  unsigned long address, spinlock_t **ptlp)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
+	spinlock_t *ptl;
 
-	/*
-	 * We need the page_table_lock to protect us from page faults,
-	 * munmap, fork, etc...
-	 */
-	spin_lock(&mm->page_table_lock);
 	pgd = pgd_offset(mm, address);
-	if (likely(pgd_present(*pgd))) {
-		pud = pud_offset(pgd, address);
-		if (likely(pud_present(*pud))) {
-			pmd = pmd_offset(pud, address);
-			if (likely(pmd_present(*pmd))) {
-				pte = pte_offset_map(pmd, address);
-				if (likely(pte_present(*pte) &&
-					   page_to_pfn(page) == pte_pfn(*pte)))
-					return pte;
-				pte_unmap(pte);
-			}
-		}
+	if (!pgd_present(*pgd))
+		return NULL;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		return NULL;
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		return NULL;
+
+	pte = pte_offset_map(pmd, address);
+	/* Make a quick check before getting the lock */
+	if (!pte_present(*pte)) {
+		pte_unmap(pte);
+		return NULL;
 	}
-	spin_unlock(&mm->page_table_lock);
-	return ERR_PTR(-ENOENT);
+
+	ptl = &mm->page_table_lock;
+	spin_lock(ptl);
+	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
+		*ptlp = ptl;
+		return pte;
+	}
+	pte_unmap_unlock(pte, ptl);
+	return NULL;
 }
 
 /*
@@ -287,28 +294,28 @@ static int page_referenced_one(struct page *page,
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
 	pte_t *pte;
+	spinlock_t *ptl;
 	int referenced = 0;
 
 	address = vma_address(page, vma);
 	if (address == -EFAULT)
 		goto out;
 
-	pte = page_check_address(page, mm, address);
-	if (!IS_ERR(pte)) {
-		if (ptep_clear_flush_young(vma, address, pte))
-			referenced++;
+	pte = page_check_address(page, mm, address, &ptl);
+	if (!pte)
+		goto out;
 
-		/* Pretend the page is referenced if the task has the
-		   swap token and is in the middle of a page fault. */
-		if (mm != current->mm && !ignore_token &&
-				has_swap_token(mm) &&
-				rwsem_is_locked(&mm->mmap_sem))
-			referenced++;
+	if (ptep_clear_flush_young(vma, address, pte))
+		referenced++;
 
-		(*mapcount)--;
-		pte_unmap(pte);
-		spin_unlock(&mm->page_table_lock);
-	}
+	/* Pretend the page is referenced if the task has the
+	   swap token and is in the middle of a page fault. */
+	if (mm != current->mm && !ignore_token && has_swap_token(mm) &&
+			rwsem_is_locked(&mm->mmap_sem))
+		referenced++;
+
+	(*mapcount)--;
+	pte_unmap_unlock(pte, ptl);
 out:
 	return referenced;
 }
@@ -507,14 +514,15 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
+	spinlock_t *ptl;
 	int ret = SWAP_AGAIN;
 
 	address = vma_address(page, vma);
 	if (address == -EFAULT)
 		goto out;
 
-	pte = page_check_address(page, mm, address);
-	if (IS_ERR(pte))
+	pte = page_check_address(page, mm, address, &ptl);
+	if (!pte)
 		goto out;
 
 	/*
@@ -564,8 +572,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 	page_cache_release(page);
 
 out_unmap:
-	pte_unmap(pte);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(pte, ptl);
 out:
 	return ret;
 }
@@ -599,19 +606,14 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte, *original_pte;
+	pte_t *pte;
 	pte_t pteval;
+	spinlock_t *ptl;
 	struct page *page;
 	unsigned long address;
 	unsigned long end;
 	unsigned long pfn;
 
-	/*
-	 * We need the page_table_lock to protect us from page faults,
-	 * munmap, fork, etc...
-	 */
-	spin_lock(&mm->page_table_lock);
-
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
 	if (address < vma->vm_start)
@@ -621,22 +623,22 @@ static void try_to_unmap_cluster(unsigned long cursor,
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
-		goto out_unlock;
+		return;
 
 	pud = pud_offset(pgd, address);
 	if (!pud_present(*pud))
-		goto out_unlock;
+		return;
 
 	pmd = pmd_offset(pud, address);
 	if (!pmd_present(*pmd))
-		goto out_unlock;
+		return;
+
+	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 
 	/* Update high watermark before we lower rss */
 	update_hiwater_rss(mm);
 
-	for (original_pte = pte = pte_offset_map(pmd, address);
-			address < end; pte++, address += PAGE_SIZE) {
-
+	for (; address < end; pte++, address += PAGE_SIZE) {
 		if (!pte_present(*pte))
 			continue;
 
@@ -669,10 +671,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
 		dec_mm_counter(mm, file_rss);
 		(*mapcount)--;
 	}
-
-	pte_unmap(original_pte);
-out_unlock:
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(pte - 1, ptl);
 }
 
 static int try_to_unmap_anon(struct page *page)

From c34d1b4d165c67b966bca4aba026443d7ff161eb Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:32 -0700
Subject: [PATCH 58/91] [PATCH] mm: kill check_user_page_readable

check_user_page_readable is a problematic variant of follow_page.  It's used
only by oprofile's i386 and arm backtrace code, at interrupt time, to
establish whether a userspace stackframe is currently readable.

This is problematic, because we want to push the page_table_lock down inside
follow_page, and later split it; whereas oprofile is doing a spin_trylock on
it (in the i386 case, forgotten in the arm case), and needs that to pin
perhaps two pages spanned by the stackframe (which might be covered by
different locks when we split).

I think oprofile is going about this in the wrong way: it doesn't need to know
the area is readable (neither i386 nor arm uses read protection of user
pages), it doesn't need to pin the memory, it should simply
__copy_from_user_inatomic, and see if that succeeds or not.  Sorry, but I've
not got around to devising the sparse __user annotations for this.

Then we can eliminate check_user_page_readable, and return to a single
follow_page without the __follow_page variants.
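
The replacement pattern the oprofile hunks below adopt is simple enough
to sketch here (read_user_frame is a hypothetical helper, not code from
this patch): attempt the copy with __copy_from_user_inatomic() and treat
a failure as the end of the backtrace, with no page pinning and no
page_table_lock.

static int read_user_frame(void __user *uframe, void *kframe, size_t size)
{
	if (!access_ok(VERIFY_READ, uframe, size))
		return -EFAULT;
	if (__copy_from_user_inatomic(kframe, uframe, size))
		return -EFAULT;
	return 0;
}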

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/oprofile/backtrace.c  | 46 +++++++---------------------------
 arch/i386/oprofile/backtrace.c | 38 ++++++++++------------------
 include/linux/mm.h             |  1 -
 mm/memory.c                    | 29 +++------------------
 4 files changed, 26 insertions(+), 88 deletions(-)

diff --git a/arch/arm/oprofile/backtrace.c b/arch/arm/oprofile/backtrace.c
index df35c452a8bf..7c22c12618cc 100644
--- a/arch/arm/oprofile/backtrace.c
+++ b/arch/arm/oprofile/backtrace.c
@@ -49,42 +49,22 @@ static struct frame_tail* kernel_backtrace(struct frame_tail *tail)
 
 static struct frame_tail* user_backtrace(struct frame_tail *tail)
 {
-	struct frame_tail buftail;
+	struct frame_tail buftail[2];
 
-	/* hardware pte might not be valid due to dirty/accessed bit emulation
-	 * so we use copy_from_user and benefit from exception fixups */
-	if (copy_from_user(&buftail, tail, sizeof(struct frame_tail)))
+	/* Also check accessibility of one struct frame_tail beyond */
+	if (!access_ok(VERIFY_READ, tail, sizeof(buftail)))
+		return NULL;
+	if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail)))
 		return NULL;
 
-	oprofile_add_trace(buftail.lr);
+	oprofile_add_trace(buftail[0].lr);
 
 	/* frame pointers should strictly progress back up the stack
 	 * (towards higher addresses) */
-	if (tail >= buftail.fp)
+	if (tail >= buftail[0].fp)
 		return NULL;
 
-	return buftail.fp-1;
-}
-
-/* Compare two addresses and see if they're on the same page */
-#define CMP_ADDR_EQUAL(x,y,offset) ((((unsigned long) x) >> PAGE_SHIFT) \
-	== ((((unsigned long) y) + offset) >> PAGE_SHIFT))
-
-/* check that the page(s) containing the frame tail are present */
-static int pages_present(struct frame_tail *tail)
-{
-	struct mm_struct * mm = current->mm;
-
-	if (!check_user_page_readable(mm, (unsigned long)tail))
-		return 0;
-
-	if (CMP_ADDR_EQUAL(tail, tail, 8))
-		return 1;
-
-	if (!check_user_page_readable(mm, ((unsigned long)tail) + 8))
-		return 0;
-
-	return 1;
+	return buftail[0].fp-1;
 }
 
 /*
@@ -118,7 +98,6 @@ static int valid_kernel_stack(struct frame_tail *tail, struct pt_regs *regs)
 void arm_backtrace(struct pt_regs * const regs, unsigned int depth)
 {
 	struct frame_tail *tail;
-	unsigned long last_address = 0;
 
 	tail = ((struct frame_tail *) regs->ARM_fp) - 1;
 
@@ -132,13 +111,6 @@ void arm_backtrace(struct pt_regs * const regs, unsigned int depth)
 		return;
 	}
 
-	while (depth-- && tail && !((unsigned long) tail & 3)) {
-		if ((!CMP_ADDR_EQUAL(last_address, tail, 0)
-			|| !CMP_ADDR_EQUAL(last_address, tail, 8))
-				&& !pages_present(tail))
-			return;
-		last_address = (unsigned long) tail;
+	while (depth-- && tail && !((unsigned long) tail & 3))
 		tail = user_backtrace(tail);
-	}
 }
-
diff --git a/arch/i386/oprofile/backtrace.c b/arch/i386/oprofile/backtrace.c
index 65dfd2edb671..21654be3f73f 100644
--- a/arch/i386/oprofile/backtrace.c
+++ b/arch/i386/oprofile/backtrace.c
@@ -12,6 +12,7 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <asm/ptrace.h>
+#include <asm/uaccess.h>
 
 struct frame_head {
 	struct frame_head * ebp;
@@ -21,26 +22,22 @@ struct frame_head {
 static struct frame_head *
 dump_backtrace(struct frame_head * head)
 {
-	oprofile_add_trace(head->ret);
+	struct frame_head bufhead[2];
+
+	/* Also check accessibility of one struct frame_head beyond */
+	if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
+		return NULL;
+	if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+		return NULL;
+
+	oprofile_add_trace(bufhead[0].ret);
 
 	/* frame pointers should strictly progress back up the stack
 	 * (towards higher addresses) */
-	if (head >= head->ebp)
+	if (head >= bufhead[0].ebp)
 		return NULL;
 
-	return head->ebp;
-}
-
-/* check that the page(s) containing the frame head are present */
-static int pages_present(struct frame_head * head)
-{
-	struct mm_struct * mm = current->mm;
-
-	/* FIXME: only necessary once per page */
-	if (!check_user_page_readable(mm, (unsigned long)head))
-		return 0;
-
-	return check_user_page_readable(mm, (unsigned long)(head + 1));
+	return bufhead[0].ebp;
 }
 
 /*
@@ -97,15 +94,6 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
 		return;
 	}
 
-#ifdef CONFIG_SMP
-	if (!spin_trylock(&current->mm->page_table_lock))
-		return;
-#endif
-
-	while (depth-- && head && pages_present(head))
+	while (depth-- && head)
 		head = dump_backtrace(head);
-
-#ifdef CONFIG_SMP
-	spin_unlock(&current->mm->page_table_lock);
-#endif
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 972e2ce8e07c..aa8de20e2e80 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -944,7 +944,6 @@ extern struct page * vmalloc_to_page(void *addr);
 extern unsigned long vmalloc_to_pfn(void *addr);
 extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
 		int write);
-extern int check_user_page_readable(struct mm_struct *mm, unsigned long address);
 int remap_pfn_range(struct vm_area_struct *, unsigned long,
 		unsigned long, unsigned long, pgprot_t);
 
diff --git a/mm/memory.c b/mm/memory.c
index 622a4ef5409f..51f7c0a220d4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -809,8 +809,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
  */
-static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
-			int read, int write, int accessed)
+struct page *follow_page(struct mm_struct *mm, unsigned long address, int write)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -846,16 +845,12 @@ static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
 	if (pte_present(pte)) {
 		if (write && !pte_write(pte))
 			goto out;
-		if (read && !pte_read(pte))
-			goto out;
 		pfn = pte_pfn(pte);
 		if (pfn_valid(pfn)) {
 			page = pfn_to_page(pfn);
-			if (accessed) {
-				if (write && !pte_dirty(pte) &&!PageDirty(page))
-					set_page_dirty(page);
-				mark_page_accessed(page);
-			}
+			if (write && !pte_dirty(pte) &&!PageDirty(page))
+				set_page_dirty(page);
+			mark_page_accessed(page);
 			return page;
 		}
 	}
@@ -864,22 +859,6 @@ out:
 	return NULL;
 }
 
-inline struct page *
-follow_page(struct mm_struct *mm, unsigned long address, int write)
-{
-	return __follow_page(mm, address, 0, write, 1);
-}
-
-/*
- * check_user_page_readable() can be called frm niterrupt context by oprofile,
- * so we need to avoid taking any non-irq-safe locks
- */
-int check_user_page_readable(struct mm_struct *mm, unsigned long address)
-{
-	return __follow_page(mm, address, 1, 0, 0) != NULL;
-}
-EXPORT_SYMBOL(check_user_page_readable);
-
 static inline int
 untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
 			 unsigned long address)

From deceb6cd17e6dfafe4c4f81b1b4153bc41b2cb70 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:33 -0700
Subject: [PATCH 59/91] [PATCH] mm: follow_page with inner ptlock

Final step in pushing down common core's page_table_lock.  follow_page no
longer wants caller to hold page_table_lock, uses pte_offset_map_lock itself;
and so no page_table_lock is taken in get_user_pages itself.

But get_user_pages (and get_futex_key) do then need follow_page to pin the
page for them: take Daniel's suggestion of bitflags to follow_page.

Need one for WRITE, another for TOUCH (it was the accessed flag before:
vanished along with check_user_page_readable, but surely get_numa_maps is
wrong to mark every page it finds as accessed), another for GET.

And another, ANON to dispose of untouched_anonymous_page: it seems silly for
that to descend a second time, let follow_page observe if there was no page
table and return ZERO_PAGE if so.  Fix minor bug in that: check VM_LOCKED -
make_pages_present ought to make readonly anonymous present.

Give get_numa_maps a cond_resched while we're there.
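
As a quick orientation, the way get_user_pages() composes the new
bitflags looks roughly like the fragment below (a condensed sketch of the
hunk further down, not additional code): touch every page, take a
reference only when the caller wants the struct page back, and allow
ZERO_PAGE for read-only access to untouched anonymous areas.

	unsigned int foll_flags = FOLL_TOUCH;

	if (pages)
		foll_flags |= FOLL_GET;
	if (write)
		foll_flags |= FOLL_WRITE;
	if (!write && !(vma->vm_flags & VM_LOCKED) &&
	    (!vma->vm_ops || !vma->vm_ops->nopage))
		foll_flags |= FOLL_ANON;

	page = follow_page(mm, start, foll_flags);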

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/task_mmu.c |   3 +-
 include/linux/mm.h |  18 +++---
 kernel/futex.c     |   6 +-
 mm/memory.c        | 154 +++++++++++++++++++++------------------------
 mm/nommu.c         |   3 +-
 5 files changed, 88 insertions(+), 96 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7e5e7ec2e36d..d2fa42006d8f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -419,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 	for_each_node(i)
 		md->node[i] =0;
 
-	spin_lock(&mm->page_table_lock);
  	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
 		page = follow_page(mm, vaddr, 0);
 		if (page) {
@@ -434,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 				md->anon++;
 			md->node[page_to_nid(page)]++;
 		}
+		cond_resched();
 	}
-	spin_unlock(&mm->page_table_lock);
 	return md;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index aa8de20e2e80..e8d1424153bb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -938,14 +938,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
-extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
+struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
+struct page *vmalloc_to_page(void *addr);
+unsigned long vmalloc_to_pfn(void *addr);
+int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+			unsigned long pfn, unsigned long size, pgprot_t);
 
-extern struct page * vmalloc_to_page(void *addr);
-extern unsigned long vmalloc_to_pfn(void *addr);
-extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
-		int write);
-int remap_pfn_range(struct vm_area_struct *, unsigned long,
-		unsigned long, unsigned long, pgprot_t);
+struct page *follow_page(struct mm_struct *, unsigned long address,
+			unsigned int foll_flags);
+#define FOLL_WRITE	0x01	/* check pte is writable */
+#define FOLL_TOUCH	0x02	/* mark page accessed */
+#define FOLL_GET	0x04	/* do get_page on page */
+#define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */
 
 #ifdef CONFIG_PROC_FS
 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
diff --git a/kernel/futex.c b/kernel/futex.c
index ca05fe6a70b2..3b4d5ad44cc6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	/*
 	 * Do a quick atomic lookup first - this is the fastpath.
 	 */
-	spin_lock(&current->mm->page_table_lock);
-	page = follow_page(mm, uaddr, 0);
+	page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET);
 	if (likely(page != NULL)) {
 		key->shared.pgoff =
 			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-		spin_unlock(&current->mm->page_table_lock);
+		put_page(page);
 		return 0;
 	}
-	spin_unlock(&current->mm->page_table_lock);
 
 	/*
 	 * Do it the general way.
diff --git a/mm/memory.c b/mm/memory.c
index 51f7c0a220d4..8461e2dd91d7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -807,86 +807,82 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 
 /*
  * Do a quick page-table lookup for a single page.
- * mm->page_table_lock must be held.
  */
-struct page *follow_page(struct mm_struct *mm, unsigned long address, int write)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+			unsigned int flags)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *ptep, pte;
+	spinlock_t *ptl;
 	unsigned long pfn;
 	struct page *page;
 
-	page = follow_huge_addr(mm, address, write);
-	if (! IS_ERR(page))
-		return page;
+	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+	if (!IS_ERR(page)) {
+		BUG_ON(flags & FOLL_GET);
+		goto out;
+	}
 
+	page = NULL;
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		goto out;
+		goto no_page_table;
 
 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		goto out;
+		goto no_page_table;
 	
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		goto out;
-	if (pmd_huge(*pmd))
-		return follow_huge_pmd(mm, address, pmd, write);
+		goto no_page_table;
 
-	ptep = pte_offset_map(pmd, address);
+	if (pmd_huge(*pmd)) {
+		BUG_ON(flags & FOLL_GET);
+		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
+		goto out;
+	}
+
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (!ptep)
 		goto out;
 
 	pte = *ptep;
-	pte_unmap(ptep);
-	if (pte_present(pte)) {
-		if (write && !pte_write(pte))
-			goto out;
-		pfn = pte_pfn(pte);
-		if (pfn_valid(pfn)) {
-			page = pfn_to_page(pfn);
-			if (write && !pte_dirty(pte) &&!PageDirty(page))
-				set_page_dirty(page);
-			mark_page_accessed(page);
-			return page;
-		}
+	if (!pte_present(pte))
+		goto unlock;
+	if ((flags & FOLL_WRITE) && !pte_write(pte))
+		goto unlock;
+	pfn = pte_pfn(pte);
+	if (!pfn_valid(pfn))
+		goto unlock;
+
+	page = pfn_to_page(pfn);
+	if (flags & FOLL_GET)
+		get_page(page);
+	if (flags & FOLL_TOUCH) {
+		if ((flags & FOLL_WRITE) &&
+		    !pte_dirty(pte) && !PageDirty(page))
+			set_page_dirty(page);
+		mark_page_accessed(page);
 	}
-
+unlock:
+	pte_unmap_unlock(ptep, ptl);
 out:
-	return NULL;
-}
+	return page;
 
-static inline int
-untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
-			 unsigned long address)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-
-	/* Check if the vma is for an anonymous mapping. */
-	if (vma->vm_ops && vma->vm_ops->nopage)
-		return 0;
-
-	/* Check if page directory entry exists. */
-	pgd = pgd_offset(mm, address);
-	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		return 1;
-
-	pud = pud_offset(pgd, address);
-	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		return 1;
-
-	/* Check if page middle directory entry exists. */
-	pmd = pmd_offset(pud, address);
-	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		return 1;
-
-	/* There is a pte slot for 'address' in 'mm'. */
-	return 0;
+no_page_table:
+	/*
+	 * When core dumping an enormous anonymous area that nobody
+	 * has touched so far, we don't want to allocate page tables.
+	 */
+	if (flags & FOLL_ANON) {
+		page = ZERO_PAGE(address);
+		if (flags & FOLL_GET)
+			get_page(page);
+		BUG_ON(flags & FOLL_WRITE);
+	}
+	return page;
 }
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -894,18 +890,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
-	unsigned int flags;
+	unsigned int vm_flags;
 
 	/* 
 	 * Require read or write permissions.
 	 * If 'force' is set, we only require the "MAY" flags.
 	 */
-	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 	i = 0;
 
 	do {
-		struct vm_area_struct *	vma;
+		struct vm_area_struct *vma;
+		unsigned int foll_flags;
 
 		vma = find_extend_vma(mm, start);
 		if (!vma && in_gate_area(tsk, start)) {
@@ -946,7 +943,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		}
 
 		if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
-				|| !(flags & vma->vm_flags))
+				|| !(vm_flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
 		if (is_vm_hugetlb_page(vma)) {
@@ -954,29 +951,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 						&start, &len, i);
 			continue;
 		}
-		spin_lock(&mm->page_table_lock);
+
+		foll_flags = FOLL_TOUCH;
+		if (pages)
+			foll_flags |= FOLL_GET;
+		if (!write && !(vma->vm_flags & VM_LOCKED) &&
+		    (!vma->vm_ops || !vma->vm_ops->nopage))
+			foll_flags |= FOLL_ANON;
+
 		do {
-			int write_access = write;
 			struct page *page;
 
-			cond_resched_lock(&mm->page_table_lock);
-			while (!(page = follow_page(mm, start, write_access))) {
+			if (write)
+				foll_flags |= FOLL_WRITE;
+
+			cond_resched();
+			while (!(page = follow_page(mm, start, foll_flags))) {
 				int ret;
-
-				/*
-				 * Shortcut for anonymous pages. We don't want
-				 * to force the creation of pages tables for
-				 * insanely big anonymously mapped areas that
-				 * nobody touched so far. This is important
-				 * for doing a core dump for these mappings.
-				 */
-				if (!write && untouched_anonymous_page(mm,vma,start)) {
-					page = ZERO_PAGE(start);
-					break;
-				}
-				spin_unlock(&mm->page_table_lock);
-				ret = __handle_mm_fault(mm, vma, start, write_access);
-
+				ret = __handle_mm_fault(mm, vma, start,
+						foll_flags & FOLL_WRITE);
 				/*
 				 * The VM_FAULT_WRITE bit tells us that do_wp_page has
 				 * broken COW when necessary, even if maybe_mkwrite
@@ -984,7 +977,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				 * subsequent page lookups as if they were reads.
 				 */
 				if (ret & VM_FAULT_WRITE)
-					write_access = 0;
+					foll_flags &= ~FOLL_WRITE;
 				
 				switch (ret & ~VM_FAULT_WRITE) {
 				case VM_FAULT_MINOR:
@@ -1000,12 +993,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				default:
 					BUG();
 				}
-				spin_lock(&mm->page_table_lock);
 			}
 			if (pages) {
 				pages[i] = page;
 				flush_dcache_page(page);
-				page_cache_get(page);
 			}
 			if (vmas)
 				vmas[i] = vma;
@@ -1013,7 +1004,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			start += PAGE_SIZE;
 			len--;
 		} while (len && start < vma->vm_end);
-		spin_unlock(&mm->page_table_lock);
 	} while (len);
 	return i;
 }
diff --git a/mm/nommu.c b/mm/nommu.c
index dfb124ffb9be..d1e076a487cb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1049,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 
 EXPORT_SYMBOL(find_vma);
 
-struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+			unsigned int foll_flags)
 {
 	return NULL;
 }

From 60ec5585496871345c1a8113d7b60ed9d9474866 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:34 -0700
Subject: [PATCH 60/91] [PATCH] mm: i386 sh sh64 ready for split ptlock

Use pte_offset_map_lock, instead of pte_offset_map (or inappropriate
pte_offset_kernel) and mm-wide page_table_lock, in sundry arch places.

The i386 vm86 mark_screen_rdonly: yes, there was and is an assumption that the
screen fits inside the one page table, as indeed it does.

The sh __do_page_fault: which handles both kernel faults (without lock) and
user mm faults (locked - though it set_pte without locking before).

The sh64 flush_cache_range and helpers: which wrongly thought callers held
page_table_lock before (only its tlb_start_vma did, and no longer does so);
moved the flush loop down, and adjusted the large versus small range decision
to consider a range which spans page tables as large.
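
The conversion applied in each of these files follows one pattern, shown
here as a sketch (walk_one_pte is a hypothetical helper, not code from
this patch): pte_offset_map_lock() replaces the pair pte_offset_map() +
spin_lock(&mm->page_table_lock), and pte_unmap_unlock() drops both the
mapping and whichever lock was taken.

static void walk_one_pte(struct mm_struct *mm, pmd_t *pmd,
			 unsigned long address)
{
	spinlock_t *ptl;
	pte_t *pte;

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (pte_present(*pte)) {
		/* ... operate on the entry under the pte lock ... */
	}
	pte_unmap_unlock(pte, ptl);
}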

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/vm86.c | 17 +++++------
 arch/sh/mm/fault.c      | 40 +++++++++++++++-----------
 arch/sh64/mm/cache.c    | 62 ++++++++++++++++++-----------------------
 3 files changed, 57 insertions(+), 62 deletions(-)

diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index 16b485009622..fc1993564f98 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -134,17 +134,16 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
 	return ret;
 }
 
-static void mark_screen_rdonly(struct task_struct * tsk)
+static void mark_screen_rdonly(struct mm_struct *mm)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte, *mapped;
+	pte_t *pte;
+	spinlock_t *ptl;
 	int i;
 
-	preempt_disable();
-	spin_lock(&tsk->mm->page_table_lock);
-	pgd = pgd_offset(tsk->mm, 0xA0000);
+	pgd = pgd_offset(mm, 0xA0000);
 	if (pgd_none_or_clear_bad(pgd))
 		goto out;
 	pud = pud_offset(pgd, 0xA0000);
@@ -153,16 +152,14 @@ static void mark_screen_rdonly(struct task_struct * tsk)
 	pmd = pmd_offset(pud, 0xA0000);
 	if (pmd_none_or_clear_bad(pmd))
 		goto out;
-	pte = mapped = pte_offset_map(pmd, 0xA0000);
+	pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
 	for (i = 0; i < 32; i++) {
 		if (pte_present(*pte))
 			set_pte(pte, pte_wrprotect(*pte));
 		pte++;
 	}
-	pte_unmap(mapped);
+	pte_unmap_unlock(pte, ptl);
 out:
-	spin_unlock(&tsk->mm->page_table_lock);
-	preempt_enable();
 	flush_tlb();
 }
 
@@ -306,7 +303,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 
 	tsk->thread.screen_bitmap = info->screen_bitmap;
 	if (info->flags & VM86_SCREEN_BITMAP)
-		mark_screen_rdonly(tsk);
+		mark_screen_rdonly(tsk->mm);
 	__asm__ __volatile__(
 		"xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
 		"movl %0,%%esp\n\t"
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 7abba2161da6..775f86cd3fe8 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -194,10 +194,13 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 			       unsigned long address)
 {
 	unsigned long addrmax = P4SEG;
-	pgd_t *dir;
+	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte;
 	pte_t entry;
+	struct mm_struct *mm;
+	spinlock_t *ptl;
+	int ret = 1;
 
 #ifdef CONFIG_SH_KGDB
 	if (kgdb_nofault && kgdb_bus_err_hook)
@@ -208,28 +211,28 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 	addrmax = P4SEG_STORE_QUE + 0x04000000;
 #endif
 
-	if (address >= P3SEG && address < addrmax)
-		dir = pgd_offset_k(address);
-	else if (address >= TASK_SIZE)
+	if (address >= P3SEG && address < addrmax) {
+		pgd = pgd_offset_k(address);
+		mm = NULL;
+	} else if (address >= TASK_SIZE)
 		return 1;
-	else if (!current->mm)
+	else if (!(mm = current->mm))
 		return 1;
 	else
-		dir = pgd_offset(current->mm, address);
+		pgd = pgd_offset(mm, address);
 
-	pmd = pmd_offset(dir, address);
-	if (pmd_none(*pmd))
+	pmd = pmd_offset(pgd, address);
+	if (pmd_none_or_clear_bad(pmd))
 		return 1;
-	if (pmd_bad(*pmd)) {
-		pmd_ERROR(*pmd);
-		pmd_clear(pmd);
-		return 1;
-	}
-	pte = pte_offset_kernel(pmd, address);
+	if (mm)
+		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+	else
+		pte = pte_offset_kernel(pmd, address);
+
 	entry = *pte;
 	if (pte_none(entry) || pte_not_present(entry)
 	    || (writeaccess && !pte_write(entry)))
-		return 1;
+		goto unlock;
 
 	if (writeaccess)
 		entry = pte_mkdirty(entry);
@@ -251,8 +254,11 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 
 	set_pte(pte, entry);
 	update_mmu_cache(NULL, address, entry);
-
-	return 0;
+	ret = 0;
+unlock:
+	if (mm)
+		pte_unmap_unlock(pte, ptl);
+	return ret;
 }
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
diff --git a/arch/sh64/mm/cache.c b/arch/sh64/mm/cache.c
index 3b87e25ea773..c0c1b21350d8 100644
--- a/arch/sh64/mm/cache.c
+++ b/arch/sh64/mm/cache.c
@@ -584,32 +584,36 @@ static void sh64_dcache_purge_phy_page(unsigned long paddr)
 	}
 }
 
-static void sh64_dcache_purge_user_page(struct mm_struct *mm, unsigned long eaddr)
+static void sh64_dcache_purge_user_pages(struct mm_struct *mm,
+				unsigned long addr, unsigned long end)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte;
 	pte_t entry;
+	spinlock_t *ptl;
 	unsigned long paddr;
 
-	/* NOTE : all the callers of this have mm->page_table_lock held, so the
-	   following page table traversal is safe even on SMP/pre-emptible. */
+	if (!mm)
+		return; /* No way to find physical address of page */
 
-	if (!mm) return; /* No way to find physical address of page */
-	pgd = pgd_offset(mm, eaddr);
-	if (pgd_bad(*pgd)) return;
+	pgd = pgd_offset(mm, addr);
+	if (pgd_bad(*pgd))
+		return;
 
-	pmd = pmd_offset(pgd, eaddr);
-	if (pmd_none(*pmd) || pmd_bad(*pmd)) return;
-
-	pte = pte_offset_kernel(pmd, eaddr);
-	entry = *pte;
-	if (pte_none(entry) || !pte_present(entry)) return;
-
-	paddr = pte_val(entry) & PAGE_MASK;
-
-	sh64_dcache_purge_coloured_phy_page(paddr, eaddr);
+	pmd = pmd_offset(pgd, addr);
+	if (pmd_none(*pmd) || pmd_bad(*pmd))
+		return;
 
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	do {
+		entry = *pte;
+		if (pte_none(entry) || !pte_present(entry))
+			continue;
+		paddr = pte_val(entry) & PAGE_MASK;
+		sh64_dcache_purge_coloured_phy_page(paddr, addr);
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	pte_unmap_unlock(pte - 1, ptl);
 }
 /****************************************************************************/
 
@@ -668,7 +672,7 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm,
 	int n_pages;
 
 	n_pages = ((end - start) >> PAGE_SHIFT);
-	if (n_pages >= 64) {
+	if (n_pages >= 64 || ((start ^ (end - 1)) & PMD_MASK)) {
 #if 1
 		sh64_dcache_purge_all();
 #else
@@ -707,20 +711,10 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm,
 		}
 #endif
 	} else {
-		/* 'Small' range */
-		unsigned long aligned_start;
-		unsigned long eaddr;
-		unsigned long last_page_start;
-
-		aligned_start = start & PAGE_MASK;
-		/* 'end' is 1 byte beyond the end of the range */
-		last_page_start = (end - 1) & PAGE_MASK;
-
-		eaddr = aligned_start;
-		while (eaddr <= last_page_start) {
-			sh64_dcache_purge_user_page(mm, eaddr);
-			eaddr += PAGE_SIZE;
-		}
+		/* Small range, covered by a single page table page */
+		start &= PAGE_MASK;	/* should already be so */
+		end = PAGE_ALIGN(end);	/* should already be so */
+		sh64_dcache_purge_user_pages(mm, start, end);
 	}
 	return;
 }
@@ -880,9 +874,7 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
 	   addresses from the user address space specified by mm, after writing
 	   back any dirty data.
 
-	   Note(1), 'end' is 1 byte beyond the end of the range to flush.
-
-	   Note(2), this is called with mm->page_table_lock held.*/
+	   Note, 'end' is 1 byte beyond the end of the range to flush. */
 
 	sh64_dcache_purge_user_range(mm, start, end);
 	sh64_icache_inv_user_page_range(mm, start, end);
@@ -898,7 +890,7 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long eaddr, unsigned
 	   the I-cache must be searched too in case the page in question is
 	   both writable and being executed from (e.g. stack trampolines.)
 
-	   Note(1), this is called with mm->page_table_lock held.
+	   Note, this is called with pte lock held.
 	   */
 
 	sh64_dcache_purge_phy_page(pfn << PAGE_SHIFT);

From 69b0475456ff7ef520e16f69d7a15c0d68b74e64 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:36 -0700
Subject: [PATCH 61/91] [PATCH] mm: arm ready for split ptlock

Prepare arm for the split page_table_lock: three issues.

Signal handling's preserve and restore of iwmmxt context currently involves
reading and writing that context to and from user space, while holding
page_table_lock to secure the user page(s) against kswapd.  If we split the
lock, then the structure might span two pages, secured by different locks;
it seems simpler just to read into and write from a kernel stack buffer,
copying that out and in without locking (the
structure is 160 bytes in size, and here we're near the top of the kernel
stack).  Or would the overhead be noticeable?

arm_syscall's cmpxchg emulation use pte_offset_map_lock, instead of
pte_offset_map and mm-wide page_table_lock; and strictly, it should now also
take mmap_sem before descending to pmd, to guard against another thread
munmapping, and the page table being pulled out beneath this thread.
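
A sketch of that process-context walk (probe_user_pte is a hypothetical
helper following the arm_syscall hunk below, on arm's two-level tables):
hold mmap_sem for read so the page table cannot be freed underneath,
then take the pte lock for the entry itself.

static int probe_user_pte(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	int present;

	down_read(&mm->mmap_sem);
	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;
	pmd = pmd_offset(pgd, addr);
	if (!pmd_present(*pmd))
		goto out;
	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	present = pte_present(*pte);
	pte_unmap_unlock(pte, ptl);
	up_read(&mm->mmap_sem);
	return present;
out:
	up_read(&mm->mmap_sem);
	return 0;
}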

Updated two comments in fault-armv.c.  adjust_pte is interesting, since its
modification of a pte in one part of the mm depends on the lock held when
calling update_mmu_cache for a pte in some other part of that mm.  This can't
be done with a split page_table_lock (and we've already taken the lowest lock
in the hierarchy here): so we'll have to disable split on arm, unless
CONFIG_CPU_CACHE_VIPT ensures that adjust_pte is never used.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/kernel/signal.c | 96 ++++++++--------------------------------
 arch/arm/kernel/traps.c  | 14 +++---
 arch/arm/mm/fault-armv.c |  7 ++-
 3 files changed, 33 insertions(+), 84 deletions(-)

diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index a94d75fef598..a917e3dd3666 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -139,93 +139,33 @@ struct iwmmxt_sigframe {
 	unsigned long	storage[0x98/4];
 };
 
-static int page_present(struct mm_struct *mm, void __user *uptr, int wr)
-{
-	unsigned long addr = (unsigned long)uptr;
-	pgd_t *pgd = pgd_offset(mm, addr);
-	if (pgd_present(*pgd)) {
-		pmd_t *pmd = pmd_offset(pgd, addr);
-		if (pmd_present(*pmd)) {
-			pte_t *pte = pte_offset_map(pmd, addr);
-			return (pte_present(*pte) && (!wr || pte_write(*pte)));
-		}
-	}
-	return 0;
-}
-
-static int copy_locked(void __user *uptr, void *kptr, size_t size, int write,
-		       void (*copyfn)(void *, void __user *))
-{
-	unsigned char v, __user *userptr = uptr;
-	int err = 0;
-
-	do {
-		struct mm_struct *mm;
-
-		if (write) {
-			__put_user_error(0, userptr, err);
-			__put_user_error(0, userptr + size - 1, err);
-		} else {
-			__get_user_error(v, userptr, err);
-			__get_user_error(v, userptr + size - 1, err);
-		}
-
-		if (err)
-			break;
-
-		mm = current->mm;
-		spin_lock(&mm->page_table_lock);
-		if (page_present(mm, userptr, write) &&
-		    page_present(mm, userptr + size - 1, write)) {
-		    	copyfn(kptr, uptr);
-		} else
-			err = 1;
-		spin_unlock(&mm->page_table_lock);
-	} while (err);
-
-	return err;
-}
-
 static int preserve_iwmmxt_context(struct iwmmxt_sigframe *frame)
 {
-	int err = 0;
+	char kbuf[sizeof(*frame) + 8];
+	struct iwmmxt_sigframe *kframe;
 
 	/* the iWMMXt context must be 64 bit aligned */
-	WARN_ON((unsigned long)frame & 7);
-
-	__put_user_error(IWMMXT_MAGIC0, &frame->magic0, err);
-	__put_user_error(IWMMXT_MAGIC1, &frame->magic1, err);
-
-	/*
-	 * iwmmxt_task_copy() doesn't check user permissions.
-	 * Let's do a dummy write on the upper boundary to ensure
-	 * access to user mem is OK all way up.
-	 */
-	err |= copy_locked(&frame->storage, current_thread_info(),
-			   sizeof(frame->storage), 1, iwmmxt_task_copy);
-	return err;
+	kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
+	kframe->magic0 = IWMMXT_MAGIC0;
+	kframe->magic1 = IWMMXT_MAGIC1;
+	iwmmxt_task_copy(current_thread_info(), &kframe->storage);
+	return __copy_to_user(frame, kframe, sizeof(*frame));
 }
 
 static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame)
 {
-	unsigned long magic0, magic1;
-	int err = 0;
+	char kbuf[sizeof(*frame) + 8];
+	struct iwmmxt_sigframe *kframe;
 
-	/* the iWMMXt context is 64 bit aligned */
-	WARN_ON((unsigned long)frame & 7);
-
-	/*
-	 * Validate iWMMXt context signature.
-	 * Also, iwmmxt_task_restore() doesn't check user permissions.
-	 * Let's do a dummy write on the upper boundary to ensure
-	 * access to user mem is OK all way up.
-	 */
-	__get_user_error(magic0, &frame->magic0, err);
-	__get_user_error(magic1, &frame->magic1, err);
-	if (!err && magic0 == IWMMXT_MAGIC0 && magic1 == IWMMXT_MAGIC1)
-		err = copy_locked(&frame->storage, current_thread_info(),
-				  sizeof(frame->storage), 0, iwmmxt_task_restore);
-	return err;
+	/* the iWMMXt context must be 64 bit aligned */
+	kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
+	if (__copy_from_user(kframe, frame, sizeof(*frame)))
+		return -1;
+	if (kframe->magic0 != IWMMXT_MAGIC0 ||
+	    kframe->magic1 != IWMMXT_MAGIC1)
+		return -1;
+	iwmmxt_task_restore(current_thread_info(), &kframe->storage);
+	return 0;
 }
 
 #endif
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index baa09601a64e..66e5a0516f23 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -483,29 +483,33 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs)
 		unsigned long addr = regs->ARM_r2;
 		struct mm_struct *mm = current->mm;
 		pgd_t *pgd; pmd_t *pmd; pte_t *pte;
+		spinlock_t *ptl;
 
 		regs->ARM_cpsr &= ~PSR_C_BIT;
-		spin_lock(&mm->page_table_lock);
+		down_read(&mm->mmap_sem);
 		pgd = pgd_offset(mm, addr);
 		if (!pgd_present(*pgd))
 			goto bad_access;
 		pmd = pmd_offset(pgd, addr);
 		if (!pmd_present(*pmd))
 			goto bad_access;
-		pte = pte_offset_map(pmd, addr);
-		if (!pte_present(*pte) || !pte_write(*pte))
+		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+		if (!pte_present(*pte) || !pte_write(*pte)) {
+			pte_unmap_unlock(pte, ptl);
 			goto bad_access;
+		}
 		val = *(unsigned long *)addr;
 		val -= regs->ARM_r0;
 		if (val == 0) {
 			*(unsigned long *)addr = regs->ARM_r1;
 			regs->ARM_cpsr |= PSR_C_BIT;
 		}
-		spin_unlock(&mm->page_table_lock);
+		pte_unmap_unlock(pte, ptl);
+		up_read(&mm->mmap_sem);
 		return val;
 
 		bad_access:
-		spin_unlock(&mm->page_table_lock);
+		up_read(&mm->mmap_sem);
 		/* simulate a write access fault */
 		do_DataAbort(addr, 15 + (1 << 11), regs);
 		return -1;
diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index be4ab3d73c91..7fc1b35a6746 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -26,6 +26,11 @@ static unsigned long shared_pte_mask = L_PTE_CACHEABLE;
 /*
  * We take the easy way out of this problem - we make the
  * PTE uncacheable.  However, we leave the write buffer on.
+ *
+ * Note that the pte lock held when calling update_mmu_cache must also
+ * guard the pte (somewhere else in the same mm) that we modify here.
+ * Therefore those configurations which might call adjust_pte (those
+ * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock.
  */
 static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
 {
@@ -127,7 +132,7 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page);
  *  2. If we have multiple shared mappings of the same space in
  *     an object, we need to deal with the cache aliasing issues.
  *
- * Note that the page_table_lock will be held.
+ * Note that the pte lock will be held.
  */
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {

From 92dc6fcc845d99e87d8168e0786796525832d130 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:36 -0700
Subject: [PATCH 62/91] [PATCH] mm: parisc pte atomicity

There's a worrying function translation_exists in parisc cacheflush.h,
unaffected by split ptlock since flush_dcache_page is using it on some other
mm, without any relevant lock.  Oh well, make it a slightly more robust by
factoring the pfn check within it.  And it looked liable to confuse a
camouflaged swap or file entry with a good pte: fix that too.
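
In sketch form (a condensed restatement of the cacheflush.h hunk below,
not extra code), the hardened check reads the pte once, filters out
entries that are neither present nor PA "flush" mappings, and only then
trusts the pfn comparison:

	pte_t pte = *pte_offset_map(pmd, addr);

	if (!(pte_val(pte) & (_PAGE_FLUSH | _PAGE_PRESENT)))
		return 0;
	return pte_pfn(pte) == pfn;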

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/parisc/kernel/cache.c      | 24 +++++++++-------------
 include/asm-parisc/cacheflush.h | 35 ++++++++++++++++++---------------
 2 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index e15f09eaed12..a065349aee37 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -270,7 +270,6 @@ void flush_dcache_page(struct page *page)
 	unsigned long offset;
 	unsigned long addr;
 	pgoff_t pgoff;
-	pte_t *pte;
 	unsigned long pfn = page_to_pfn(page);
 
 
@@ -301,21 +300,16 @@ void flush_dcache_page(struct page *page)
 		 * taking a page fault if the pte doesn't exist.
 		 * This is just for speed.  If the page translation
 		 * isn't there, there's no point exciting the
-		 * nadtlb handler into a nullification frenzy */
-
-
-  		if(!(pte = translation_exists(mpnt, addr)))
-			continue;
-
-		/* make sure we really have this page: the private
+		 * nadtlb handler into a nullification frenzy.
+		 *
+		 * Make sure we really have this page: the private
 		 * mappings may cover this area but have COW'd this
-		 * particular page */
-		if(pte_pfn(*pte) != pfn)
-  			continue;
-
-		__flush_cache_page(mpnt, addr);
-
-		break;
+		 * particular page.
+		 */
+  		if (translation_exists(mpnt, addr, pfn)) {
+			__flush_cache_page(mpnt, addr);
+			break;
+		}
 	}
 	flush_dcache_mmap_unlock(mapping);
 }
diff --git a/include/asm-parisc/cacheflush.h b/include/asm-parisc/cacheflush.h
index aa592d8c0e39..1bc3c83ee74b 100644
--- a/include/asm-parisc/cacheflush.h
+++ b/include/asm-parisc/cacheflush.h
@@ -100,30 +100,34 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
 
 /* Simple function to work out if we have an existing address translation
  * for a user space vma. */
-static inline pte_t *__translation_exists(struct mm_struct *mm,
-					  unsigned long addr)
+static inline int translation_exists(struct vm_area_struct *vma,
+				unsigned long addr, unsigned long pfn)
 {
-	pgd_t *pgd = pgd_offset(mm, addr);
+	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
 	pmd_t *pmd;
-	pte_t *pte;
+	pte_t pte;
 
 	if(pgd_none(*pgd))
-		return NULL;
+		return 0;
 
 	pmd = pmd_offset(pgd, addr);
 	if(pmd_none(*pmd) || pmd_bad(*pmd))
-		return NULL;
+		return 0;
 
-	pte = pte_offset_map(pmd, addr);
+	/* We cannot take the pte lock here: flush_cache_page is usually
+	 * called with pte lock already held.  Whereas flush_dcache_page
+	 * takes flush_dcache_mmap_lock, which is lower in the hierarchy:
+	 * the vma itself is secure, but the pte might come or go racily.
+	 */
+	pte = *pte_offset_map(pmd, addr);
+	/* But pte_unmap() does nothing on this architecture */
 
-	/* The PA flush mappings show up as pte_none, but they're
-	 * valid none the less */
-	if(pte_none(*pte) && ((pte_val(*pte) & _PAGE_FLUSH) == 0))
-		return NULL;
-	return pte;
+	/* Filter out coincidental file entries and swap entries */
+	if (!(pte_val(pte) & (_PAGE_FLUSH|_PAGE_PRESENT)))
+		return 0;
+
+	return pte_pfn(pte) == pfn;
 }
-#define	translation_exists(vma, addr)	__translation_exists((vma)->vm_mm, addr)
-
 
 /* Private function to flush a page from the cache of a non-current
  * process.  cr25 contains the Page Directory of the current user
@@ -175,9 +179,8 @@ flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long
 {
 	BUG_ON(!vma->vm_mm->context);
 
-	if(likely(translation_exists(vma, vmaddr)))
+	if (likely(translation_exists(vma, vmaddr, pfn)))
 		__flush_cache_page(vma, vmaddr);
 
 }
 #endif
-

From a7e4705b24e611574e5c23105005ffdff694fd58 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:37 -0700
Subject: [PATCH 63/91] [PATCH] mm: cris v32 mmu_context_lock

The cris v32 switch_mm guards get_mmu_context with next->page_table_lock: good that
it's not really SMP yet, since get_mmu_context messes with global variables
affecting other mms.  Replace by global mmu_context_lock.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/cris/arch-v32/mm/tlb.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/cris/arch-v32/mm/tlb.c b/arch/cris/arch-v32/mm/tlb.c
index 8233406798d3..b08a28bb58ab 100644
--- a/arch/cris/arch-v32/mm/tlb.c
+++ b/arch/cris/arch-v32/mm/tlb.c
@@ -175,6 +175,8 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	return 0;
 }
 
+static DEFINE_SPINLOCK(mmu_context_lock);
+
 /* Called in schedule() just before actually doing the switch_to. */
 void
 switch_mm(struct mm_struct *prev, struct mm_struct *next,
@@ -183,10 +185,10 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	int cpu = smp_processor_id();
 
 	/* Make sure there is a MMU context. */
-	spin_lock(&next->page_table_lock);
+	spin_lock(&mmu_context_lock);
 	get_mmu_context(next);
 	cpu_set(cpu, next->cpu_vm_mask);
-	spin_unlock(&next->page_table_lock);
+	spin_unlock(&mmu_context_lock);
 
 	/*
 	 * Remember the pgd for the fault handlers. Keep a seperate copy of it

From 8f5cd76c185a4c8aeb5fe1e560e3612bfc050c35 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:38 -0700
Subject: [PATCH 64/91] [PATCH] mm: uml pte atomicity

There's usually a good reason when a pte is examined without the lock; but it
makes me nervous when the pointer is dereferenced more than once.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/kernel/process_kern.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c
index 0d73ceeece72..34b54a3e2132 100644
--- a/arch/um/kernel/process_kern.c
+++ b/arch/um/kernel/process_kern.c
@@ -222,6 +222,7 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
+	pte_t ptent;
 
 	if(task->mm == NULL) 
 		return(ERR_PTR(-EINVAL));
@@ -238,12 +239,13 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
 		return(ERR_PTR(-EINVAL));
 
 	pte = pte_offset_kernel(pmd, addr);
-	if(!pte_present(*pte)) 
+	ptent = *pte;
+	if(!pte_present(ptent))
 		return(ERR_PTR(-EINVAL));
 
 	if(pte_out != NULL)
-		*pte_out = *pte;
-	return((void *) (pte_val(*pte) & PAGE_MASK) + (addr & ~PAGE_MASK));
+		*pte_out = ptent;
+	return((void *) (pte_val(ptent) & PAGE_MASK) + (addr & ~PAGE_MASK));
 }
 
 char *current_cmd(void)

From b38c6845b695141259019e2b7c0fe6c32a6e720d Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:39 -0700
Subject: [PATCH 65/91] [PATCH] mm: uml kill unused

In worrying over the various pte operations in different architectures, I came
across some unused functions in UML: remove mprotect_kernel_vm,
protect_vm_page and addr_pte.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/include/tlb.h   |  1 -
 arch/um/kernel/tt/tlb.c | 36 ------------------------------------
 2 files changed, 37 deletions(-)

diff --git a/arch/um/include/tlb.h b/arch/um/include/tlb.h
index 45d7da6c3b2c..8efc1e0f1b84 100644
--- a/arch/um/include/tlb.h
+++ b/arch/um/include/tlb.h
@@ -34,7 +34,6 @@ struct host_vm_op {
 	} u;
 };
 
-extern void mprotect_kernel_vm(int w);
 extern void force_flush_all(void);
 extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
                              unsigned long end_addr, int force,
diff --git a/arch/um/kernel/tt/tlb.c b/arch/um/kernel/tt/tlb.c
index f1d85dbb45b9..ae6217c86135 100644
--- a/arch/um/kernel/tt/tlb.c
+++ b/arch/um/kernel/tt/tlb.c
@@ -74,42 +74,6 @@ void flush_tlb_kernel_range_tt(unsigned long start, unsigned long end)
                 atomic_inc(&vmchange_seq);
 }
 
-static void protect_vm_page(unsigned long addr, int w, int must_succeed)
-{
-	int err;
-
-	err = protect_memory(addr, PAGE_SIZE, 1, w, 1, must_succeed);
-	if(err == 0) return;
-	else if((err == -EFAULT) || (err == -ENOMEM)){
-		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-		protect_vm_page(addr, w, 1);
-	}
-	else panic("protect_vm_page : protect failed, errno = %d\n", err);
-}
-
-void mprotect_kernel_vm(int w)
-{
-	struct mm_struct *mm;
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	unsigned long addr;
-	
-	mm = &init_mm;
-	for(addr = start_vm; addr < end_vm;){
-		pgd = pgd_offset(mm, addr);
-		pud = pud_offset(pgd, addr);
-		pmd = pmd_offset(pud, addr);
-		if(pmd_present(*pmd)){
-			pte = pte_offset_kernel(pmd, addr);
-			if(pte_present(*pte)) protect_vm_page(addr, w, 0);
-			addr += PAGE_SIZE;
-		}
-		else addr += PMD_SIZE;
-	}
-}
-
 void flush_tlb_kernel_vm_tt(void)
 {
         flush_tlb_kernel_range(start_vm, end_vm);

From 4c21e2f2441dc5fbb957b030333f5a3f2d02dea7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:40 -0700
Subject: [PATCH 66/91] [PATCH] mm: split page table lock

Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.

This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock.  (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)

In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.

Splitting the lock is not quite for free: another cacheline access.  Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS.  But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.

There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
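
The core idea can be sketched as follows (field and macro shapes here are
simplified assumptions, not necessarily the exact definitions added to
include/linux/mm.h by this patch): each page table page carries its own
spinlock in its struct page, and pte_offset_map_lock() picks either that
per-page lock or the old mm-wide page_table_lock, decided at compile time
by comparing NR_CPUS with CONFIG_SPLIT_PTLOCK_CPUS.

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
#define pte_lockptr(mm, pmd)	(&pmd_page(*(pmd))->ptl)	/* per-page lock */
#else
#define pte_lockptr(mm, pmd)	(&(mm)->page_table_lock)	/* mm-wide lock */
#endif

#define pte_offset_map_lock(mm, pmd, address, ptlp)	\
({							\
	spinlock_t *__ptl = pte_lockptr(mm, pmd);	\
	pte_t *__pte = pte_offset_map(pmd, address);	\
	*(ptlp) = __ptl;				\
	spin_lock(__ptl);				\
	__pte;						\
})

#define pte_unmap_unlock(pte, ptl)	do {		\
	spin_unlock(ptl);				\
	pte_unmap(pte);					\
} while (0)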

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/mm/mm-armv.c       |  1 +
 arch/frv/mm/pgalloc.c       |  4 ++--
 arch/i386/mm/pgtable.c      |  8 +++----
 arch/um/kernel/skas/mmu.c   |  1 +
 fs/afs/file.c               |  4 ++--
 fs/buffer.c                 |  2 +-
 fs/jfs/jfs_metapage.c       | 12 +++++-----
 fs/xfs/linux-2.6/xfs_buf.c  |  7 +++---
 include/linux/buffer_head.h |  6 ++---
 include/linux/mm.h          | 46 ++++++++++++++++++++++++++++++-------
 kernel/kexec.c              |  4 ++--
 mm/Kconfig                  | 13 +++++++++++
 mm/filemap.c                |  2 +-
 mm/memory.c                 | 24 +++++++++++--------
 mm/mremap.c                 | 11 ++++++++-
 mm/page_alloc.c             | 16 ++++++-------
 mm/page_io.c                |  6 +++--
 mm/rmap.c                   |  4 ++--
 mm/shmem.c                  | 22 ++++++++----------
 mm/swap.c                   |  2 +-
 mm/swap_state.c             |  8 +++----
 mm/swapfile.c               | 12 +++++-----
 mm/vmscan.c                 |  2 +-
 23 files changed, 138 insertions(+), 79 deletions(-)

diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c
index 60f3e039bac2..1221fdde1769 100644
--- a/arch/arm/mm/mm-armv.c
+++ b/arch/arm/mm/mm-armv.c
@@ -229,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd)
 	pte = pmd_page(*pmd);
 	pmd_clear(pmd);
 	dec_page_state(nr_page_table_pages);
+	pte_lock_deinit(pte);
 	pte_free(pte);
 	pmd_free(pmd);
 free:
diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c
index 4eaec0f3525b..2c67dfe5a6b3 100644
--- a/arch/frv/mm/pgalloc.c
+++ b/arch/frv/mm/pgalloc.c
@@ -87,14 +87,14 @@ static inline void pgd_list_add(pgd_t *pgd)
 	if (pgd_list)
 		pgd_list->private = (unsigned long) &page->index;
 	pgd_list = page;
-	page->private = (unsigned long) &pgd_list;
+	set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
 	struct page *next, **pprev, *page = virt_to_page(pgd);
 	next = (struct page *) page->index;
-	pprev = (struct page **) page->private;
+	pprev = (struct page **)page_private(page);
 	*pprev = next;
 	if (next)
 		next->private = (unsigned long) pprev;
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index dcdce2c6c532..39c099f15b5f 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -188,19 +188,19 @@ static inline void pgd_list_add(pgd_t *pgd)
 	struct page *page = virt_to_page(pgd);
 	page->index = (unsigned long)pgd_list;
 	if (pgd_list)
-		pgd_list->private = (unsigned long)&page->index;
+		set_page_private(pgd_list, (unsigned long)&page->index);
 	pgd_list = page;
-	page->private = (unsigned long)&pgd_list;
+	set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
 	struct page *next, **pprev, *page = virt_to_page(pgd);
 	next = (struct page *)page->index;
-	pprev = (struct page **)page->private;
+	pprev = (struct page **)page_private(page);
 	*pprev = next;
 	if (next)
-		next->private = (unsigned long)pprev;
+		set_page_private(next, (unsigned long)pprev);
 }
 
 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 02cf36e0331a..9e5e39cea821 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -144,6 +144,7 @@ void destroy_context_skas(struct mm_struct *mm)
 
 	if(!proc_mm || !ptrace_faultinfo){
 		free_page(mmu->id.stack);
+		pte_lock_deinit(virt_to_page(mmu->last_page_table));
 		pte_free_kernel((pte_t *) mmu->last_page_table);
                 dec_page_state(nr_page_table_pages);
 #ifdef CONFIG_3_LEVEL_PGTABLES
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0d576987ec67..4975c9c193dd 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
 		cachefs_uncache_page(vnode->cache, page);
 #endif
 
-		pageio = (struct cachefs_page *) page->private;
-		page->private = 0;
+		pageio = (struct cachefs_page *) page_private(page);
+		set_page_private(page, 0);
 		ClearPagePrivate(page);
 
 		if (pageio)
diff --git a/fs/buffer.c b/fs/buffer.c
index b1667986442f..2066e4cb700c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -96,7 +96,7 @@ static void
 __clear_page_buffers(struct page *page)
 {
 	ClearPagePrivate(page);
-	page->private = 0;
+	set_page_private(page, 0);
 	page_cache_release(page);
 }
 
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 26091a5f88d4..8a53981f9f27 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -86,7 +86,7 @@ struct meta_anchor {
 	atomic_t io_count;
 	struct metapage *mp[MPS_PER_PAGE];
 };
-#define mp_anchor(page) ((struct meta_anchor *)page->private)
+#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
 
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
@@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 		if (!a)
 			return -ENOMEM;
 		memset(a, 0, sizeof(struct meta_anchor));
-		page->private = (unsigned long)a;
+		set_page_private(page, (unsigned long)a);
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
 	a->mp[index] = NULL;
 	if (--a->mp_count == 0) {
 		kfree(a);
-		page->private = 0;
+		set_page_private(page, 0);
 		ClearPagePrivate(page);
 		kunmap(page);
 	}
@@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
 #else
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
-	return PagePrivate(page) ? (struct metapage *)page->private : NULL;
+	return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
 }
 
 static inline int insert_metapage(struct page *page, struct metapage *mp)
 {
 	if (mp) {
-		page->private = (unsigned long)mp;
+		set_page_private(page, (unsigned long)mp);
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 
 static inline void remove_metapage(struct page *page, struct metapage *mp)
 {
-	page->private = 0;
+	set_page_private(page, 0);
 	ClearPagePrivate(page);
 	kunmap(page);
 }
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ba4767c04adf..4cd46abe8434 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -181,8 +181,9 @@ set_page_region(
 	size_t		offset,
 	size_t		length)
 {
-	page->private |= page_region_mask(offset, length);
-	if (page->private == ~0UL)
+	set_page_private(page,
+		page_private(page) | page_region_mask(offset, length));
+	if (page_private(page) == ~0UL)
 		SetPageUptodate(page);
 }
 
@@ -194,7 +195,7 @@ test_page_region(
 {
 	unsigned long	mask = page_region_mask(offset, length);
 
-	return (mask && (page->private & mask) == mask);
+	return (mask && (page_private(page) & mask) == mask);
 }
 
 /*
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 88af42f5e04a..c937d6e65502 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp)
 /* If we *know* page->private refers to buffer_heads */
 #define page_buffers(page)					\
 	({							\
-		BUG_ON(!PagePrivate(page));		\
-		((struct buffer_head *)(page)->private);	\
+		BUG_ON(!PagePrivate(page));			\
+		((struct buffer_head *)page_private(page));	\
 	})
 #define page_has_buffers(page)	PagePrivate(page)
 
@@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page,
 {
 	page_cache_get(page);
 	SetPagePrivate(page);
-	page->private = (unsigned long)head;
+	set_page_private(page, (unsigned long)head);
 }
 
 static inline void get_bh(struct buffer_head *bh)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e8d1424153bb..8a514eca40d5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -226,13 +226,18 @@ struct page {
 					 * to show when page is mapped
 					 * & limit reverse map searches.
 					 */
-	unsigned long private;		/* Mapping-private opaque data:
+	union {
+		unsigned long private;	/* Mapping-private opaque data:
 					 * usually used for buffer_heads
 					 * if PagePrivate set; used for
 					 * swp_entry_t if PageSwapCache
 					 * When page is free, this indicates
 					 * order in the buddy system.
 					 */
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+		spinlock_t ptl;
+#endif
+	} u;
 	struct address_space *mapping;	/* If low bit clear, points to
 					 * inode address_space, or NULL.
 					 * If page mapped as anonymous
@@ -260,6 +265,9 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
+#define page_private(page)		((page)->u.private)
+#define set_page_private(page, v)	((page)->u.private = (v))
+
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
@@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *));
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-static inline int page_count(struct page *p)
+static inline int page_count(struct page *page)
 {
-	if (PageCompound(p))
-		p = (struct page *)p->private;
-	return atomic_read(&(p)->_count) + 1;
+	if (PageCompound(page))
+		page = (struct page *)page_private(page);
+	return atomic_read(&page->_count) + 1;
 }
 
 static inline void get_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
-		page = (struct page *)page->private;
+		page = (struct page *)page_private(page);
 	atomic_inc(&page->_count);
 }
 
@@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page)
 static inline pgoff_t page_index(struct page *page)
 {
 	if (unlikely(PageSwapCache(page)))
-		return page->private;
+		return page_private(page);
 	return page->index;
 }
 
@@ -779,9 +787,31 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * We tuck a spinlock to guard each pagetable page into its struct page,
+ * at page->private, with BUILD_BUG_ON to make sure that this will not
+ * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
+ * When freeing, reset page->mapping so free_pages_check won't complain.
+ */
+#define __pte_lockptr(page)	&((page)->u.ptl)
+#define pte_lock_init(_page)	do {					\
+	spin_lock_init(__pte_lockptr(_page));				\
+} while (0)
+#define pte_lock_deinit(page)	((page)->mapping = NULL)
+#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#else
+/*
+ * We use mm->page_table_lock to guard all pagetable pages of the mm.
+ */
+#define pte_lock_init(page)	do {} while (0)
+#define pte_lock_deinit(page)	do {} while (0)
+#define pte_lockptr(mm, pmd)	({(void)(pmd); &(mm)->page_table_lock;})
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
 #define pte_offset_map_lock(mm, pmd, address, ptlp)	\
 ({							\
-	spinlock_t *__ptl = &(mm)->page_table_lock;	\
+	spinlock_t *__ptl = pte_lockptr(mm, pmd);	\
 	pte_t *__pte = pte_offset_map(pmd, address);	\
 	*(ptlp) = __ptl;				\
 	spin_lock(__ptl);				\
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 36c5d9cd4cc1..2c95848fbce8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -334,7 +334,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 	if (pages) {
 		unsigned int count, i;
 		pages->mapping = NULL;
-		pages->private = order;
+		set_page_private(pages, order);
 		count = 1 << order;
 		for (i = 0; i < count; i++)
 			SetPageReserved(pages + i);
@@ -347,7 +347,7 @@ static void kimage_free_pages(struct page *page)
 {
 	unsigned int order, count, i;
 
-	order = page->private;
+	order = page_private(page);
 	count = 1 << order;
 	for (i = 0; i < count; i++)
 		ClearPageReserved(page + i);
diff --git a/mm/Kconfig b/mm/Kconfig
index 391ffc54d136..f35a550ba4b9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -111,3 +111,16 @@ config SPARSEMEM_STATIC
 config SPARSEMEM_EXTREME
 	def_bool y
 	depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
+#
+config SPLIT_PTLOCK_CPUS
+	int
+	default "4096" if ARM && !CPU_CACHE_VIPT
+	default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
+	default "4"
diff --git a/mm/filemap.c b/mm/filemap.c
index 8aa344e88489..f560b41c8f61 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -152,7 +152,7 @@ static int sync_page(void *word)
 	 * in the ->sync_page() methods make essential use of the
 	 * page_mapping(), merely passing the page down to the backing
 	 * device's unplug functions when it's non-NULL, which in turn
-	 * ignore it for all cases but swap, where only page->private is
+	 * ignore it for all cases but swap, where only page_private(page) is
 	 * of interest. When page_mapping() does go NULL, the entire
 	 * call stack gracefully ignores the page and returns.
 	 * -- wli
diff --git a/mm/memory.c b/mm/memory.c
index 8461e2dd91d7..e9ef599498b5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
 {
 	struct page *page = pmd_page(*pmd);
 	pmd_clear(pmd);
+	pte_lock_deinit(page);
 	pte_free_tlb(tlb, page);
 	dec_page_state(nr_page_table_pages);
 	tlb->mm->nr_ptes--;
@@ -294,10 +295,12 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 	if (!new)
 		return -ENOMEM;
 
+	pte_lock_init(new);
 	spin_lock(&mm->page_table_lock);
-	if (pmd_present(*pmd))		/* Another has populated it */
+	if (pmd_present(*pmd)) {	/* Another has populated it */
+		pte_lock_deinit(new);
 		pte_free(new);
-	else {
+	} else {
 		mm->nr_ptes++;
 		inc_page_state(nr_page_table_pages);
 		pmd_populate(mm, pmd, new);
@@ -432,7 +435,7 @@ again:
 	if (!dst_pte)
 		return -ENOMEM;
 	src_pte = pte_offset_map_nested(src_pmd, addr);
-	src_ptl = &src_mm->page_table_lock;
+	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock(src_ptl);
 
 	do {
@@ -1194,15 +1197,16 @@ EXPORT_SYMBOL(remap_pfn_range);
  * (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page and do_no_page can safely check later on).
  */
-static inline int pte_unmap_same(struct mm_struct *mm,
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
 				pte_t *page_table, pte_t orig_pte)
 {
 	int same = 1;
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
 	if (sizeof(pte_t) > sizeof(unsigned long)) {
-		spin_lock(&mm->page_table_lock);
+		spinlock_t *ptl = pte_lockptr(mm, pmd);
+		spin_lock(ptl);
 		same = pte_same(*page_table, orig_pte);
-		spin_unlock(&mm->page_table_lock);
+		spin_unlock(ptl);
 	}
 #endif
 	pte_unmap(page_table);
@@ -1655,7 +1659,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t pte;
 	int ret = VM_FAULT_MINOR;
 
-	if (!pte_unmap_same(mm, page_table, orig_pte))
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		goto out;
 
 	entry = pte_to_swp_entry(orig_pte);
@@ -1773,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page_cache_get(page);
 		entry = mk_pte(page, vma->vm_page_prot);
 
-		ptl = &mm->page_table_lock;
+		ptl = pte_lockptr(mm, pmd);
 		spin_lock(ptl);
 		if (!pte_none(*page_table))
 			goto release;
@@ -1934,7 +1938,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgoff_t pgoff;
 	int err;
 
-	if (!pte_unmap_same(mm, page_table, orig_pte))
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		return VM_FAULT_MINOR;
 
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
@@ -1992,7 +1996,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 					pte, pmd, write_access, entry);
 	}
 
-	ptl = &mm->page_table_lock;
+	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
diff --git a/mm/mremap.c b/mm/mremap.c
index 8de77b632a20..b535438c363c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -72,7 +72,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	struct address_space *mapping = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
-	spinlock_t *old_ptl;
+	spinlock_t *old_ptl, *new_ptl;
 
 	if (vma->vm_file) {
 		/*
@@ -88,8 +88,15 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 			new_vma->vm_truncate_count = 0;
 	}
 
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * pte locks because exclusive mmap_sem prevents deadlock.
+	 */
 	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
  	new_pte = pte_offset_map_nested(new_pmd, new_addr);
+	new_ptl = pte_lockptr(mm, new_pmd);
+	if (new_ptl != old_ptl)
+		spin_lock(new_ptl);
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 				   new_pte++, new_addr += PAGE_SIZE) {
@@ -101,6 +108,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
 
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
 	pte_unmap_nested(new_pte - 1);
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0541288ebf4b..a2995a5d012c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -154,7 +154,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
 		struct page *p = page + i;
 
 		SetPageCompound(p);
-		p->private = (unsigned long)page;
+		set_page_private(p, (unsigned long)page);
 	}
 }
 
@@ -174,7 +174,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 
 		if (!PageCompound(p))
 			bad_page(__FUNCTION__, page);
-		if (p->private != (unsigned long)page)
+		if (page_private(p) != (unsigned long)page)
 			bad_page(__FUNCTION__, page);
 		ClearPageCompound(p);
 	}
@@ -187,18 +187,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
  * So, we don't need atomic page->flags operations here.
  */
 static inline unsigned long page_order(struct page *page) {
-	return page->private;
+	return page_private(page);
 }
 
 static inline void set_page_order(struct page *page, int order) {
-	page->private = order;
+	set_page_private(page, order);
 	__SetPagePrivate(page);
 }
 
 static inline void rmv_page_order(struct page *page)
 {
 	__ClearPagePrivate(page);
-	page->private = 0;
+	set_page_private(page, 0);
 }
 
 /*
@@ -238,7 +238,7 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (a) the buddy is free &&
  * (b) the buddy is on the buddy system &&
  * (c) a page and its buddy have the same order.
- * for recording page's order, we use page->private and PG_private.
+ * for recording page's order, we use page_private(page) and PG_private.
  *
  */
 static inline int page_is_buddy(struct page *page, int order)
@@ -264,7 +264,7 @@ static inline int page_is_buddy(struct page *page, int order)
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
  * free pages of length of (1 << order) and marked with PG_Private.Page's
- * order is recorded in page->private field.
+ * order is recorded in page_private(page) field.
  * So when we are allocating or freeing one, we can derive the state of the
  * other.  That is, if we allocate a small block, and both were   
  * free, the remainder of the region must be split into blocks.   
@@ -463,7 +463,7 @@ static void prep_new_page(struct page *page, int order)
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_checked | 1 << PG_mappedtodisk);
-	page->private = 0;
+	set_page_private(page, 0);
 	set_page_refs(page, order);
 	kernel_map_pages(page, 1 << order, 1);
 }
diff --git a/mm/page_io.c b/mm/page_io.c
index 330e00d6db00..bb2b0d53889c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		unlock_page(page);
 		goto out;
 	}
-	bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write);
+	bio = get_swap_bio(GFP_NOIO, page_private(page), page,
+				end_swap_bio_write);
 	if (bio == NULL) {
 		set_page_dirty(page);
 		unlock_page(page);
@@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page)
 
 	BUG_ON(!PageLocked(page));
 	ClearPageUptodate(page);
-	bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read);
+	bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
+				end_swap_bio_read);
 	if (bio == NULL) {
 		unlock_page(page);
 		ret = -ENOMEM;
diff --git a/mm/rmap.c b/mm/rmap.c
index a84bdfe582c0..a33e779d1bd8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -274,7 +274,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 		return NULL;
 	}
 
-	ptl = &mm->page_table_lock;
+	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
 		*ptlp = ptl;
@@ -550,7 +550,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 	update_hiwater_rss(mm);
 
 	if (PageAnon(page)) {
-		swp_entry_t entry = { .val = page->private };
+		swp_entry_t entry = { .val = page_private(page) };
 		/*
 		 * Store the swap location in the pte.
 		 * See handle_pte_fault() ...
diff --git a/mm/shmem.c b/mm/shmem.c
index 37777f4c11f8..dc25565a61e9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -71,9 +71,6 @@
 /* Pretend that each entry is of this size in directory's i_size */
 #define BOGO_DIRENT_SIZE 20
 
-/* Keep swapped page count in private field of indirect struct page */
-#define nr_swapped		private
-
 /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
 enum sgp_type {
 	SGP_QUICK,	/* don't try more than file page cache lookup */
@@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
 
 	entry->val = value;
 	info->swapped += incdec;
-	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
-		kmap_atomic_to_page(entry)->nr_swapped += incdec;
+	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
+		struct page *page = kmap_atomic_to_page(entry);
+		set_page_private(page, page_private(page) + incdec);
+	}
 }
 
 /*
@@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
 
 		spin_unlock(&info->lock);
 		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
-		if (page) {
-			page->nr_swapped = 0;
-		}
+		if (page)
+			set_page_private(page, 0);
 		spin_lock(&info->lock);
 
 		if (!page) {
@@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode)
 			diroff = 0;
 		}
 		subdir = dir[diroff];
-		if (subdir && subdir->nr_swapped) {
+		if (subdir && page_private(subdir)) {
 			size = limit - idx;
 			if (size > ENTRIES_PER_PAGE)
 				size = ENTRIES_PER_PAGE;
@@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode)
 			nr_swaps_freed += freed;
 			if (offset)
 				spin_lock(&info->lock);
-			subdir->nr_swapped -= freed;
+			set_page_private(subdir, page_private(subdir) - freed);
 			if (offset)
 				spin_unlock(&info->lock);
-			BUG_ON(subdir->nr_swapped > offset);
+			BUG_ON(page_private(subdir) > offset);
 		}
 		if (offset)
 			offset = 0;
@@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
 			dir = shmem_dir_map(subdir);
 		}
 		subdir = *dir;
-		if (subdir && subdir->nr_swapped) {
+		if (subdir && page_private(subdir)) {
 			ptr = shmem_swp_map(subdir);
 			size = limit - idx;
 			if (size > ENTRIES_PER_PAGE)
diff --git a/mm/swap.c b/mm/swap.c
index 21d15f99805c..b89512877ec2 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,7 +39,7 @@ int page_cluster;
 void put_page(struct page *page)
 {
 	if (unlikely(PageCompound(page))) {
-		page = (struct page *)page->private;
+		page = (struct page *)page_private(page);
 		if (put_page_testzero(page)) {
 			void (*dtor)(struct page *page);
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 132164f7d0a7..cafc1edcbeba 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
 			page_cache_get(page);
 			SetPageLocked(page);
 			SetPageSwapCache(page);
-			page->private = entry.val;
+			set_page_private(page, entry.val);
 			total_swapcache_pages++;
 			pagecache_acct(1);
 		}
@@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page)
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
 
-	radix_tree_delete(&swapper_space.page_tree, page->private);
-	page->private = 0;
+	radix_tree_delete(&swapper_space.page_tree, page_private(page));
+	set_page_private(page, 0);
 	ClearPageSwapCache(page);
 	total_swapcache_pages--;
 	pagecache_acct(-1);
@@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page)
 {
 	swp_entry_t entry;
 
-	entry.val = page->private;
+	entry.val = page_private(page);
 
 	write_lock_irq(&swapper_space.tree_lock);
 	__delete_from_swap_cache(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 510f0039b000..8970c0b74194 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 	swp_entry_t entry;
 
 	down_read(&swap_unplug_sem);
-	entry.val = page->private;
+	entry.val = page_private(page);
 	if (PageSwapCache(page)) {
 		struct block_device *bdev = swap_info[swp_type(entry)].bdev;
 		struct backing_dev_info *bdi;
@@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 		/*
 		 * If the page is removed from swapcache from under us (with a
 		 * racy try_to_unuse/swapoff) we need an additional reference
-		 * count to avoid reading garbage from page->private above. If
-		 * the WARN_ON triggers during a swapoff it maybe the race
+		 * count to avoid reading garbage from page_private(page) above.
+		 * If the WARN_ON triggers during a swapoff it maybe the race
 		 * condition and it's harmless. However if it triggers without
 		 * swapoff it signals a problem.
 		 */
@@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page)
 	struct swap_info_struct *p;
 	swp_entry_t entry;
 
-	entry.val = page->private;
+	entry.val = page_private(page);
 	p = swap_info_get(entry);
 	if (p) {
 		/* Subtract the 1 for the swap cache itself */
@@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page)
 	if (page_count(page) != 2) /* 2: us + cache */
 		return 0;
 
-	entry.val = page->private;
+	entry.val = page_private(page);
 	p = swap_info_get(entry);
 	if (!p)
 		return 0;
@@ -1042,7 +1042,7 @@ int page_queue_congested(struct page *page)
 	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */
 
 	if (PageSwapCache(page)) {
-		swp_entry_t entry = { .val = page->private };
+		swp_entry_t entry = { .val = page_private(page) };
 		struct swap_info_struct *sis;
 
 		sis = get_swap_info_struct(swp_type(entry));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 41d1064aabfb..135bf8ca96ee 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -521,7 +521,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
-			swp_entry_t swap = { .val = page->private };
+			swp_entry_t swap = { .val = page_private(page) };
 			__delete_from_swap_cache(page);
 			write_unlock_irq(&mapping->tree_lock);
 			swap_free(swap);

From f412ac08c9861b4791af0145934c22f1458686da Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:41 -0700
Subject: [PATCH 67/91] [PATCH] mm: fix rss and mmlist locking

A couple of oddities were guarded by page_table_lock, no longer properly
guarded when that is split.

The mm_counters of file_rss and anon_rss: in that case, make those an
atomic_t, or an atomic64_t if the architecture supports it.  Definitions
courtesy of Christoph Lameter, who spent considerable effort on more
scalable ways of counting, but found insufficient benefit in practice.

And adding an mm with swap to the mmlist for swapoff: the list is well-
guarded by its own lock, but the list_empty check now has to be repeated
inside it.
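
For reference, the mmlist hunks below follow the usual check / lock /
re-check idiom; a condensed sketch:

	if (list_empty(&mm->mmlist)) {
		spin_lock(&mmlist_lock);
		/* re-check: another thread may have added it meanwhile */
		if (list_empty(&mm->mmlist))
			list_add(&mm->mmlist, &init_mm.mmlist);
		spin_unlock(&mmlist_lock);
	}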

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 42 ++++++++++++++++++++++++++++++++++++++----
 mm/memory.c           |  4 +++-
 mm/rmap.c             |  3 ++-
 3 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292cb57ce38f..1c30bc308ef1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -249,13 +249,47 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * The mm counters are not protected by its page_table_lock,
+ * so must be incremented atomically.
+ */
+#ifdef ATOMIC64_INIT
+#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member)
+typedef atomic64_t mm_counter_t;
+#else /* !ATOMIC64_INIT */
+/*
+ * The counters wrap back to 0 at 2^32 * PAGE_SIZE,
+ * that is, at 16TB if using 4kB page size.
+ */
+#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
+typedef atomic_t mm_counter_t;
+#endif /* !ATOMIC64_INIT */
+
+#else  /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+/*
+ * The mm counters are protected by its page_table_lock,
+ * so can be incremented directly.
+ */
 #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
 #define get_mm_counter(mm, member) ((mm)->_##member)
 #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
 #define inc_mm_counter(mm, member) (mm)->_##member++
 #define dec_mm_counter(mm, member) (mm)->_##member--
-#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
+typedef unsigned long mm_counter_t;
 
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
+#define get_mm_rss(mm)					\
+	(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
 #define update_hiwater_rss(mm)	do {			\
 	unsigned long _rss = get_mm_rss(mm);		\
 	if ((mm)->hiwater_rss < _rss)			\
@@ -266,8 +300,6 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 		(mm)->hiwater_vm = (mm)->total_vm;	\
 } while (0)
 
-typedef unsigned long mm_counter_t;
-
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
@@ -291,7 +323,9 @@ struct mm_struct {
 						 * by mmlist_lock
 						 */
 
-	/* Special counters protected by the page_table_lock */
+	/* Special counters, in some configurations protected by the
+	 * page_table_lock, in other configurations by being atomic.
+	 */
 	mm_counter_t _file_rss;
 	mm_counter_t _anon_rss;
 
diff --git a/mm/memory.c b/mm/memory.c
index e9ef599498b5..d68421dd64ef 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -372,7 +372,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			/* make sure dst_mm is on swapoff's mmlist. */
 			if (unlikely(list_empty(&dst_mm->mmlist))) {
 				spin_lock(&mmlist_lock);
-				list_add(&dst_mm->mmlist, &src_mm->mmlist);
+				if (list_empty(&dst_mm->mmlist))
+					list_add(&dst_mm->mmlist,
+						 &src_mm->mmlist);
 				spin_unlock(&mmlist_lock);
 			}
 		}
diff --git a/mm/rmap.c b/mm/rmap.c
index a33e779d1bd8..a7427bbf57e4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -559,7 +559,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 		swap_duplicate(entry);
 		if (list_empty(&mm->mmlist)) {
 			spin_lock(&mmlist_lock);
-			list_add(&mm->mmlist, &init_mm.mmlist);
+			if (list_empty(&mm->mmlist))
+				list_add(&mm->mmlist, &init_mm.mmlist);
 			spin_unlock(&mmlist_lock);
 		}
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));

From b8072f099b7829a6ff3eba618e1d079a81f753f8 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:41 -0700
Subject: [PATCH 68/91] [PATCH] mm: update comments to pte lock

Updated several references to page_table_lock in common code comments.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/pgtable.h |  2 +-
 include/linux/mempolicy.h     |  3 +--
 mm/filemap.c                  |  6 +++---
 mm/rmap.c                     | 10 +++++-----
 mm/swap_state.c               |  3 +--
 5 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index ff28c8b31f58..7dca30a26c53 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -8,7 +8,7 @@
  *  - update the page tables
  *  - inform the TLB about the new one
  *
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock.
+ * We hold the mm semaphore for reading, and the pte lock.
  *
  * Note: the old pte is known to not be writable, so we don't need to
  * worry about dirty bits etc getting lost.
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 38e60a099399..7af8cb836e78 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -47,8 +47,7 @@ struct vm_area_struct;
  * Locking policy for interlave:
  * In process context there is no locking because only the process accesses
  * its own state. All vma manipulation is somewhat protected by a down_read on
- * mmap_sem. For allocating in the interleave policy the page_table_lock
- * must be also aquired to protect il_next.
+ * mmap_sem.
  *
  * Freeing policy:
  * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd.
diff --git a/mm/filemap.c b/mm/filemap.c
index f560b41c8f61..036599d1177e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *
  *  ->mmap_sem
  *    ->i_mmap_lock
- *      ->page_table_lock	(various places, mainly in mmap.c)
+ *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
  *
  *  ->mmap_sem
@@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *    ->anon_vma.lock		(vma_adjust)
  *
  *  ->anon_vma.lock
- *    ->page_table_lock		(anon_vma_prepare and various)
+ *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
  *
- *  ->page_table_lock
+ *  ->page_table_lock or pte_lock
  *    ->swap_lock		(try_to_unmap_one)
  *    ->private_lock		(try_to_unmap_one)
  *    ->tree_lock		(try_to_unmap_one)
diff --git a/mm/rmap.c b/mm/rmap.c
index a7427bbf57e4..914d04b98bee 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,7 +32,7 @@
  *   page->flags PG_locked (lock_page)
  *     mapping->i_mmap_lock
  *       anon_vma->lock
- *         mm->page_table_lock
+ *         mm->page_table_lock or pte_lock
  *           zone->lru_lock (in mark_page_accessed)
  *           swap_lock (in swap_duplicate, swap_info_get)
  *             mmlist_lock (in mmput, drain_mmlist and others)
@@ -244,7 +244,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 /*
  * Check that @page is mapped at @address into @mm.
  *
- * On success returns with mapped pte and locked mm->page_table_lock.
+ * On success returns with pte mapped and locked.
  */
 pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 			  unsigned long address, spinlock_t **ptlp)
@@ -445,7 +445,7 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
  * @vma:	the vm area in which the mapping is added
  * @address:	the user virtual address mapped
  *
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
  */
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
@@ -468,7 +468,7 @@ void page_add_anon_rmap(struct page *page,
  * page_add_file_rmap - add pte mapping to a file page
  * @page: the page to add the mapping to
  *
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
  */
 void page_add_file_rmap(struct page *page)
 {
@@ -483,7 +483,7 @@ void page_add_file_rmap(struct page *page)
  * page_remove_rmap - take down pte mapping from a page
  * @page: page to remove mapping from
  *
- * Caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
  */
 void page_remove_rmap(struct page *page)
 {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index cafc1edcbeba..dfd9a46755b8 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -259,8 +259,7 @@ static inline void free_swap_cache(struct page *page)
 
 /* 
  * Perform a free_page(), also freeing any swap cache associated with
- * this page if it is the last user of the page. Can not do a lock_page,
- * as we are holding the page_table_lock spinlock.
+ * this page if it is the last user of the page.
  */
 void free_page_and_swap_cache(struct page *page)
 {

From 96527980d4cb8f65fe49efdbc4ab92c0837d42f6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 29 Oct 2005 18:16:42 -0700
Subject: [PATCH 69/91] [PATCH] hugetlbfs: move free_inodes accounting

Move hugetlbfs accounting into ->alloc_inode / ->destroy_inode.  This keeps
the code simpler, fixes a leak where a failing inode allocation wouldn't
give the counter back, and moves hugetlbfs_delete_inode and
hugetlbfs_forget_inode closer to their generic counterparts.
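
Condensed sketch of the resulting pairing (the hunks below are
authoritative): the counter is taken before the slab allocation and handed
back on failure, so it can no longer be leaked:

	if (!hugetlbfs_dec_free_inodes(sbinfo))
		return NULL;
	p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL);
	if (!p) {
		/* give the reserved inode count back on failure */
		hugetlbfs_inc_free_inodes(sbinfo);
		return NULL;
	}
	return &p->vfs_inode;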

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c | 80 ++++++++++++++++++++++++--------------------
 1 file changed, 43 insertions(+), 37 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a826a8add5e3..8e9d43633365 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -224,8 +224,6 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
 
 static void hugetlbfs_delete_inode(struct inode *inode)
 {
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb);
-
 	hlist_del_init(&inode->i_hash);
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
@@ -238,12 +236,6 @@ static void hugetlbfs_delete_inode(struct inode *inode)
 
 	security_inode_delete(inode);
 
-	if (sbinfo->free_inodes >= 0) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_inodes++;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-
 	clear_inode(inode);
 	destroy_inode(inode);
 }
@@ -251,7 +243,6 @@ static void hugetlbfs_delete_inode(struct inode *inode)
 static void hugetlbfs_forget_inode(struct inode *inode)
 {
 	struct super_block *super_block = inode->i_sb;
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(super_block);
 
 	if (hlist_unhashed(&inode->i_hash))
 		goto out_truncate;
@@ -278,12 +269,6 @@ out_truncate:
 	if (inode->i_data.nrpages)
 		truncate_hugepages(&inode->i_data, 0);
 
-	if (sbinfo->free_inodes >= 0) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_inodes++;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-
 	clear_inode(inode);
 	destroy_inode(inode);
 }
@@ -375,17 +360,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 					gid_t gid, int mode, dev_t dev)
 {
 	struct inode *inode;
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
-
-	if (sbinfo->free_inodes >= 0) {
-		spin_lock(&sbinfo->stat_lock);
-		if (!sbinfo->free_inodes) {
-			spin_unlock(&sbinfo->stat_lock);
-			return NULL;
-		}
-		sbinfo->free_inodes--;
-		spin_unlock(&sbinfo->stat_lock);
-	}
 
 	inode = new_inode(sb);
 	if (inode) {
@@ -527,29 +501,51 @@ static void hugetlbfs_put_super(struct super_block *sb)
 	}
 }
 
+static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
+{
+	if (sbinfo->free_inodes >= 0) {
+		spin_lock(&sbinfo->stat_lock);
+		if (unlikely(!sbinfo->free_inodes)) {
+			spin_unlock(&sbinfo->stat_lock);
+			return 0;
+		}
+		sbinfo->free_inodes--;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+
+	return 1;
+}
+
+static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
+{
+	if (sbinfo->free_inodes >= 0) {
+		spin_lock(&sbinfo->stat_lock);
+		sbinfo->free_inodes++;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+}
+
+
 static kmem_cache_t *hugetlbfs_inode_cachep;
 
 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 {
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
 	struct hugetlbfs_inode_info *p;
 
-	p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL);
-	if (!p)
+	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
 		return NULL;
+	p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL);
+	if (unlikely(!p)) {
+		hugetlbfs_inc_free_inodes(sbinfo);
+		return NULL;
+	}
 	return &p->vfs_inode;
 }
 
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
-{
-	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
-
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
-}
-
 static void hugetlbfs_destroy_inode(struct inode *inode)
 {
+	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
 	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
 	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
 }
@@ -561,6 +557,16 @@ static struct address_space_operations hugetlbfs_aops = {
 	.set_page_dirty	= hugetlbfs_set_page_dirty,
 };
 
+
+static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(&ei->vfs_inode);
+}
+
 struct file_operations hugetlbfs_file_operations = {
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= simple_sync_file,

From 149f4211afda85743e3a3db3fa3abbd81506cf2b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 29 Oct 2005 18:16:43 -0700
Subject: [PATCH 70/91] [PATCH] hugetlbfs: clean up hugetlbfs_delete_inode

Make hugetlbfs_delete_inode look the same as generic_delete_inode, fixing a
bunch of missing updates to it at the same time.  Rename it to
hugetlbfs_do_delete_inode and add a real hugetlbfs_delete_inode that
implements ->delete_inode.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8e9d43633365..2b9d1bee9220 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -224,19 +224,44 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
 
 static void hugetlbfs_delete_inode(struct inode *inode)
 {
-	hlist_del_init(&inode->i_hash);
+	if (inode->i_data.nrpages)
+		truncate_hugepages(&inode->i_data, 0);
+	clear_inode(inode);
+}
+
+static void hugetlbfs_do_delete_inode(struct inode *inode)
+{
+	struct super_operations *op = inode->i_sb->s_op;
+
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
 
-	if (inode->i_data.nrpages)
-		truncate_hugepages(&inode->i_data, 0);
-
 	security_inode_delete(inode);
 
-	clear_inode(inode);
+	if (op->delete_inode) {
+		void (*delete)(struct inode *) = op->delete_inode;
+		if (!is_bad_inode(inode))
+			DQUOT_INIT(inode);
+		/* Filesystems implementing their own
+		 * s_op->delete_inode are required to call
+		 * truncate_inode_pages and clear_inode()
+		 * internally
+		 */
+		delete(inode);
+	} else {
+		truncate_inode_pages(&inode->i_data, 0);
+		clear_inode(inode);
+	}
+
+	spin_lock(&inode_lock);
+	hlist_del_init(&inode->i_hash);
+	spin_unlock(&inode_lock);
+	wake_up_inode(inode);
+	if (inode->i_state != I_CLEAR)
+		BUG();
 	destroy_inode(inode);
 }
 
@@ -276,7 +301,7 @@ out_truncate:
 static void hugetlbfs_drop_inode(struct inode *inode)
 {
 	if (!inode->i_nlink)
-		hugetlbfs_delete_inode(inode);
+		hugetlbfs_do_delete_inode(inode);
 	else
 		hugetlbfs_forget_inode(inode);
 }
@@ -594,6 +619,7 @@ static struct super_operations hugetlbfs_ops = {
 	.alloc_inode    = hugetlbfs_alloc_inode,
 	.destroy_inode  = hugetlbfs_destroy_inode,
 	.statfs		= hugetlbfs_statfs,
+	.delete_inode	= hugetlbfs_delete_inode,
 	.drop_inode	= hugetlbfs_drop_inode,
 	.put_super	= hugetlbfs_put_super,
 };

From 6b09b9df05f319ec27e0dae1721efe097b8b23ad Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 29 Oct 2005 18:16:44 -0700
Subject: [PATCH 71/91] [PATCH] kill hugetlbfs_do_delete_inode

hugetlbfs_do_delete_inode is the same as generic_delete_inode now, so remove
it in favour of the latter.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c | 38 +-------------------------------------
 1 file changed, 1 insertion(+), 37 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2b9d1bee9220..ffdad4e64671 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -229,42 +229,6 @@ static void hugetlbfs_delete_inode(struct inode *inode)
 	clear_inode(inode);
 }
 
-static void hugetlbfs_do_delete_inode(struct inode *inode)
-{
-	struct super_operations *op = inode->i_sb->s_op;
-
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
-	inode->i_state |= I_FREEING;
-	inodes_stat.nr_inodes--;
-	spin_unlock(&inode_lock);
-
-	security_inode_delete(inode);
-
-	if (op->delete_inode) {
-		void (*delete)(struct inode *) = op->delete_inode;
-		if (!is_bad_inode(inode))
-			DQUOT_INIT(inode);
-		/* Filesystems implementing their own
-		 * s_op->delete_inode are required to call
-		 * truncate_inode_pages and clear_inode()
-		 * internally
-		 */
-		delete(inode);
-	} else {
-		truncate_inode_pages(&inode->i_data, 0);
-		clear_inode(inode);
-	}
-
-	spin_lock(&inode_lock);
-	hlist_del_init(&inode->i_hash);
-	spin_unlock(&inode_lock);
-	wake_up_inode(inode);
-	if (inode->i_state != I_CLEAR)
-		BUG();
-	destroy_inode(inode);
-}
-
 static void hugetlbfs_forget_inode(struct inode *inode)
 {
 	struct super_block *super_block = inode->i_sb;
@@ -301,7 +265,7 @@ out_truncate:
 static void hugetlbfs_drop_inode(struct inode *inode)
 {
 	if (!inode->i_nlink)
-		hugetlbfs_do_delete_inode(inode);
+		generic_delete_inode(inode);
 	else
 		hugetlbfs_forget_inode(inode);
 }

From 0b1533f67cc1a595457af6d05ab3510294e2ca9c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 29 Oct 2005 18:16:45 -0700
Subject: [PATCH 72/91] [PATCH] cleanup hugetlbfs_forget_inode

Reformat hugetlbfs_forget_inode and add the missing but harmless
write_inode_now call.  It looks the same as generic_forget_inode now except
for the call to truncate_hugepages instead of truncate_inode_pages.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ffdad4e64671..8f94feb24c0a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -231,25 +231,28 @@ static void hugetlbfs_delete_inode(struct inode *inode)
 
 static void hugetlbfs_forget_inode(struct inode *inode)
 {
-	struct super_block *super_block = inode->i_sb;
+	struct super_block *sb = inode->i_sb;
 
-	if (hlist_unhashed(&inode->i_hash))
-		goto out_truncate;
-
-	if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
-		list_del(&inode->i_list);
-		list_add(&inode->i_list, &inode_unused);
-	}
-	inodes_stat.nr_unused++;
-	if (!super_block || (super_block->s_flags & MS_ACTIVE)) {
+	if (!hlist_unhashed(&inode->i_hash)) {
+		if (!(inode->i_state & (I_DIRTY|I_LOCK)))
+			list_move(&inode->i_list, &inode_unused);
+		inodes_stat.nr_unused++;
+		if (!sb || (sb->s_flags & MS_ACTIVE)) {
+			spin_unlock(&inode_lock);
+			return;
+		}
+		inode->i_state |= I_WILL_FREE;
 		spin_unlock(&inode_lock);
-		return;
+		/*
+		 * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
+		 * in our backing_dev_info.
+		 */
+		write_inode_now(inode, 1);
+		spin_lock(&inode_lock);
+		inode->i_state &= ~I_WILL_FREE;
+		inodes_stat.nr_unused--;
+		hlist_del_init(&inode->i_hash);
 	}
-
-	/* write_inode_now() ? */
-	inodes_stat.nr_unused--;
-	hlist_del_init(&inode->i_hash);
-out_truncate:
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
 	inode->i_state |= I_FREEING;
@@ -257,7 +260,6 @@ out_truncate:
 	spin_unlock(&inode_lock);
 	if (inode->i_data.nrpages)
 		truncate_hugepages(&inode->i_data, 0);
-
 	clear_inode(inode);
 	destroy_inode(inode);
 }

From 551110a94aa15890d1709b179c4be1e66ff6db53 Mon Sep 17 00:00:00 2001
From: Krishnakumar R <rkrishnakumar@gmail.com>
Date: Sat, 29 Oct 2005 18:16:45 -0700
Subject: [PATCH 73/91] [PATCH] hugetlb: remove repeated code

Clean up some repeated code related to HugeTLB.  hugetlb_zero_setup would
have already assigned file->f_op.

Signed-off-by: Krishnakumar. R <rkrishnakumar@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 ipc/shm.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ipc/shm.c b/ipc/shm.c
index dca90489e3b0..b58c651d31ae 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -233,10 +233,11 @@ static int newseg (key_t key, int shmflg, size_t size)
 	shp->id = shm_buildid(id,shp->shm_perm.seq);
 	shp->shm_file = file;
 	file->f_dentry->d_inode->i_ino = shp->id;
-	if (shmflg & SHM_HUGETLB)
-		set_file_hugepages(file);
-	else
+
+	/* Hugetlb ops would have already been assigned. */
+	if (!(shmflg & SHM_HUGETLB))
 		file->f_op = &shm_file_operations;
+
 	shm_tot += numpages;
 	shm_unlock(shp);
 	return shp->id;

From 4c887265977213985091476be40ab11dfdcb4caf Mon Sep 17 00:00:00 2001
From: Adam Litke <agl@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:46 -0700
Subject: [PATCH 74/91] [PATCH] hugetlb: demand fault handler

Below is a patch to implement demand faulting for huge pages.  The main
motivation for changing from prefaulting to demand faulting is so that huge
page memory areas can be allocated according to NUMA policy.

Thanks to consolidated hugetlb code, switching the behavior requires changing
only one fault handler.  The bulk of the patch just moves the logic from
hugetlb_prefault() to hugetlb_fault() and find_lock_huge_page().
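
A condensed sketch of the find-or-create idiom the new find_lock_huge_page()
relies on (quota, truncation and allocation-failure checks elided; the hunks
below are authoritative):

	retry:
		page = find_lock_page(mapping, idx);
		if (!page) {
			page = alloc_huge_page();
			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err == -EEXIST) {
				/* lost the race to another faulting thread */
				put_page(page);
				goto retry;
			}
		}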

Signed-off-by: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c |   7 +-
 mm/hugetlb.c         | 176 +++++++++++++++++++++++--------------------
 2 files changed, 95 insertions(+), 88 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8f94feb24c0a..2627efe767cf 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -48,7 +48,6 @@ int sysctl_hugetlb_shm_group;
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct address_space *mapping = inode->i_mapping;
 	loff_t len, vma_len;
 	int ret;
 
@@ -79,10 +78,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
-	ret = hugetlb_prefault(mapping, vma);
-	if (ret)
-		goto out;
-
+	ret = 0;
+	hugetlb_prefault_arch_hook(vma->vm_mm);
 	if (inode->i_size < len)
 		inode->i_size = len;
 out:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f29b7dc02c39..c9b43360fd33 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -321,10 +321,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
-		if (! ptep)
-			/* This can happen on truncate, or if an
-			 * mmap() is aborted due to an error before
-			 * the prefault */
+		if (!ptep)
 			continue;
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -340,81 +337,92 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 }
 
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+			unsigned long idx)
 {
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
+	struct page *page;
+	int err;
+	struct inode *inode = mapping->host;
+	unsigned long size;
 
-	WARN_ON(!is_vm_hugetlb_page(vma));
-	BUG_ON(vma->vm_start & ~HPAGE_MASK);
-	BUG_ON(vma->vm_end & ~HPAGE_MASK);
+retry:
+	page = find_lock_page(mapping, idx);
+	if (page)
+		goto out;
 
-	hugetlb_prefault_arch_hook(mm);
+	/* Check to make sure the mapping hasn't been truncated */
+	size = i_size_read(inode) >> HPAGE_SHIFT;
+	if (idx >= size)
+		goto out;
 
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
+	if (hugetlb_get_quota(mapping))
+		goto out;
+	page = alloc_huge_page();
+	if (!page) {
+		hugetlb_put_quota(mapping);
+		goto out;
+	}
 
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = find_get_page(mapping, idx);
-		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			page = alloc_huge_page();
-			if (!page) {
-				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
-				goto out;
-			}
-			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-			if (! ret) {
-				unlock_page(page);
-			} else {
-				hugetlb_put_quota(mapping);
-				free_huge_page(page);
-				goto out;
-			}
-		}
-		spin_lock(&mm->page_table_lock);
-		add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-		set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
-		spin_unlock(&mm->page_table_lock);
+	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+	if (err) {
+		put_page(page);
+		hugetlb_put_quota(mapping);
+		if (err == -EEXIST)
+			goto retry;
+		page = NULL;
 	}
 out:
-	return ret;
+	return page;
 }
 
-/*
- * On ia64 at least, it is possible to receive a hugetlb fault from a
- * stale zero entry left in the TLB from earlier hardware prefetching.
- * Low-level arch code should already have flushed the stale entry as
- * part of its fault handling, but we do need to accept this minor fault
- * and return successfully.  Whereas the "normal" case is that this is
- * an access to a hugetlb page which has been truncated off since mmap.
- */
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access)
 {
 	int ret = VM_FAULT_SIGBUS;
+	unsigned long idx;
+	unsigned long size;
 	pte_t *pte;
+	struct page *page;
+	struct address_space *mapping;
+
+	pte = huge_pte_alloc(mm, address);
+	if (!pte)
+		goto out;
+
+	mapping = vma->vm_file->f_mapping;
+	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+	/*
+	 * Use page lock to guard against racing truncation
+	 * before we get page_table_lock.
+	 */
+	page = find_lock_huge_page(mapping, idx);
+	if (!page)
+		goto out;
 
 	spin_lock(&mm->page_table_lock);
-	pte = huge_pte_offset(mm, address);
-	if (pte && !pte_none(*pte))
-		ret = VM_FAULT_MINOR;
+	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+	if (idx >= size)
+		goto backout;
+
+	ret = VM_FAULT_MINOR;
+	if (!pte_none(*pte))
+		goto backout;
+
+	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+	set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
 	spin_unlock(&mm->page_table_lock);
+	unlock_page(page);
+out:
 	return ret;
+
+backout:
+	spin_unlock(&mm->page_table_lock);
+	hugetlb_put_quota(mapping);
+	unlock_page(page);
+	put_page(page);
+	goto out;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -424,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vpfn, vaddr = *position;
 	int remainder = *length;
 
-	BUG_ON(!is_vm_hugetlb_page(vma));
-
 	vpfn = vaddr/PAGE_SIZE;
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
+		pte_t *pte;
+		struct page *page;
+
+		/*
+		 * Some archs (sparc64, sh*) have multiple pte_ts to
+		 * each hugepage.  We have to make sure we get the
+		 * first, for the page indexing below to work.
+		 */
+		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+
+		if (!pte || pte_none(*pte)) {
+			int ret;
+
+			spin_unlock(&mm->page_table_lock);
+			ret = hugetlb_fault(mm, vma, vaddr, 0);
+			spin_lock(&mm->page_table_lock);
+			if (ret == VM_FAULT_MINOR)
+				continue;
+
+			remainder = 0;
+			if (!i)
+				i = -EFAULT;
+			break;
+		}
 
 		if (pages) {
-			pte_t *pte;
-			struct page *page;
-
-			/* Some archs (sparc64, sh*) have multiple
-			 * pte_ts to each hugepage.  We have to make
-			 * sure we get the first, for the page
-			 * indexing below to work. */
-			pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
-			/* the hugetlb file might have been truncated */
-			if (!pte || pte_none(*pte)) {
-				remainder = 0;
-				if (!i)
-					i = -EFAULT;
-				break;
-			}
-
 			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
-			WARN_ON(!PageCompound(page));
-
 			get_page(page);
 			pages[i] = page;
 		}

From 2e9b367c2273ed21c9852a04d90944d472c4f3e6 Mon Sep 17 00:00:00 2001
From: Adam Litke <agl@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:47 -0700
Subject: [PATCH 75/91] [PATCH] hugetlb: overcommit accounting check

Basic overcommit checking for hugetlbfs_file_mmap() based on an implementation
used with demand faulting in SLES9.

Since demand faulting can't guarantee the availability of pages at mmap
time, this patch implements a basic sanity check to ensure that the number
of huge pages required to satisfy the mmap is currently available.
Despite the obvious race, I think it is a good start on doing proper
accounting.  I'd like to work towards an accounting system that mimics the
semantics of normal pages (especially for the MAP_PRIVATE/COW case).  That
work is underway and builds on what this patch starts.

Huge page shared memory segments are simpler and still maintain their
commit on shmget semantics.
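
For example (hypothetical numbers): a VMA spanning 32 huge pages, 8 of which
are already in the page cache, makes huge_pages_needed() return
(32 - 8) << HPAGE_SHIFT bytes, and the mmap fails with -ENOMEM unless that
much huge page memory is currently free.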

Signed-off-by: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c | 63 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 53 insertions(+), 10 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2627efe767cf..e026c807e6b3 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -45,9 +45,58 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = {
 
 int sysctl_hugetlb_shm_group;
 
+static void huge_pagevec_release(struct pagevec *pvec)
+{
+	int i;
+
+	for (i = 0; i < pagevec_count(pvec); ++i)
+		put_page(pvec->pages[i]);
+
+	pagevec_reinit(pvec);
+}
+
+/*
+ * huge_pages_needed tries to determine the number of new huge pages that
+ * will be required to fully populate this VMA.  This will be equal to
+ * the size of the VMA in huge pages minus the number of huge pages
+ * (covered by this VMA) that are found in the page cache.
+ *
+ * Result is in bytes to be compatible with is_hugepage_mem_enough()
+ */
+unsigned long
+huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
+{
+	int i;
+	struct pagevec pvec;
+	unsigned long start = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
+	pgoff_t next = vma->vm_pgoff;
+	pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT);
+
+	pagevec_init(&pvec, 0);
+	while (next < endpg) {
+		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
+			break;
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			if (page->index > next)
+				next = page->index;
+			if (page->index >= endpg)
+				break;
+			next++;
+			hugepages--;
+		}
+		huge_pagevec_release(&pvec);
+	}
+	return hugepages << HPAGE_SHIFT;
+}
+
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long bytes;
 	loff_t len, vma_len;
 	int ret;
 
@@ -66,6 +115,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
 		return -EINVAL;
 
+	bytes = huge_pages_needed(mapping, vma);
+	if (!is_hugepage_mem_enough(bytes))
+		return -ENOMEM;
+
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
 	down(&inode->i_sem);
@@ -168,16 +221,6 @@ static int hugetlbfs_commit_write(struct file *file,
 	return -EINVAL;
 }
 
-static void huge_pagevec_release(struct pagevec *pvec)
-{
-	int i;
-
-	for (i = 0; i < pagevec_count(pvec); ++i)
-		put_page(pvec->pages[i]);
-
-	pagevec_reinit(pvec);
-}
-
 static void truncate_huge_page(struct page *page)
 {
 	clear_page_dirty(page);

From 1a44e149084d772a1bcf4cdbdde8a013a8a1cfde Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <andrea@suse.de>
Date: Sat, 29 Oct 2005 18:16:48 -0700
Subject: [PATCH 76/91] [PATCH] .text page fault SMP scalability optimization

We had a problem on ppc64 where, with more than 4 threads, a large system
wouldn't scale well while faulting in the .text (most of the time was spent
in the kernel even though it was a userland compute-intensive app).  The
reason is the useless overwrite of the same pte from all CPUs.

I fixed it this way (verified on an older kernel, but the forward port is
almost identical).  This will benefit all archs, not just ppc64.
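
In outline, the change below computes the would-be pte first and only
writes it back (and updates the MMU cache) when it actually differs from
what is already installed; a simplified sketch of the pattern, not the
exact hunk:

	pte_t old_entry, entry;

	old_entry = entry = *pte;
	entry = pte_mkyoung(entry);
	if (write_access)
		entry = pte_mkdirty(entry);

	if (!pte_same(old_entry, entry)) {
		ptep_set_access_flags(vma, address, pte, entry, write_access);
		update_mmu_cache(vma, address, entry);
	} else if (write_access) {
		/* possibly a protection fault: just drop the stale TLB entry */
		flush_tlb_page(vma, address);
	}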

Signed-off-by: Andrea Arcangeli <andrea@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index d68421dd64ef..0f60baf6f69b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1980,9 +1980,10 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 		pte_t *pte, pmd_t *pmd, int write_access)
 {
 	pte_t entry;
+	pte_t old_entry;
 	spinlock_t *ptl;
 
-	entry = *pte;
+	old_entry = entry = *pte;
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (!vma->vm_ops || !vma->vm_ops->nopage)
@@ -2009,9 +2010,20 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
-	ptep_set_access_flags(vma, address, pte, entry, write_access);
-	update_mmu_cache(vma, address, entry);
-	lazy_mmu_prot_update(entry);
+	if (!pte_same(old_entry, entry)) {
+		ptep_set_access_flags(vma, address, pte, entry, write_access);
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
+	} else {
+		/*
+		 * This is needed only for protection faults but the arch code
+		 * is not yet telling us if this is a protection fault or not.
+		 * This still avoids useless tlb flushes for .text page faults
+		 * with threads.
+		 */
+		if (write_access)
+			flush_tlb_page(vma, address);
+	}
 unlock:
 	pte_unmap_unlock(pte, ptl);
 	return VM_FAULT_MINOR;

From 2774812f417db562f0d659d2c1b5755ba35d2770 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:49 -0700
Subject: [PATCH 77/91] [PATCH] memory hotplug prep: kill local_mapnr

The following series implements memory hot-add for ppc64 and i386.  There are
x86_64 and ia64 implementations that will be submitted shortly as well,
through the normal maintainers.

This patch:

local_mapnr is unused, except in an alpha header.  Keep the alpha one,
kill the rest.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/mmzone.h   | 6 ------
 include/asm-m32r/mmzone.h   | 6 ------
 include/asm-parisc/mmzone.h | 6 ------
 include/asm-ppc64/mmzone.h  | 3 ---
 4 files changed, 21 deletions(-)

diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 348fe3a4879d..620a90641ea8 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -88,12 +88,6 @@ static inline int pfn_to_nid(unsigned long pfn)
 	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;		\
 })
 
-#define local_mapnr(kvaddr)						\
-({									\
-	unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT;		\
-	(__pfn - node_start_pfn(pfn_to_nid(__pfn)));			\
-})
-
 /* XXX: FIXME -- wli */
 #define kern_addr_valid(kaddr)	(0)
 
diff --git a/include/asm-m32r/mmzone.h b/include/asm-m32r/mmzone.h
index d58878ec899e..adc7970a77ec 100644
--- a/include/asm-m32r/mmzone.h
+++ b/include/asm-m32r/mmzone.h
@@ -21,12 +21,6 @@ extern struct pglist_data *node_data[];
 	__pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1;	\
 })
 
-#define local_mapnr(kvaddr)						\
-({									\
-	unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT;		\
-	(__pfn - node_start_pfn(pfn_to_nid(__pfn)));			\
-})
-
 #define pfn_to_page(pfn)						\
 ({									\
 	unsigned long __pfn = pfn;					\
diff --git a/include/asm-parisc/mmzone.h b/include/asm-parisc/mmzone.h
index 595d3dce120a..ae039f4fd711 100644
--- a/include/asm-parisc/mmzone.h
+++ b/include/asm-parisc/mmzone.h
@@ -27,12 +27,6 @@ extern struct node_map_data node_data[];
 })
 #define node_localnr(pfn, nid)		((pfn) - node_start_pfn(nid))
 
-#define local_mapnr(kvaddr)						\
-({									\
-	unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT;		\
-	(__pfn - node_start_pfn(pfn_to_nid(__pfn)));			\
-})
-
 #define pfn_to_page(pfn)						\
 ({									\
 	unsigned long __pfn = (pfn);					\
diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h
index ed473f4b0152..80a708e7093a 100644
--- a/include/asm-ppc64/mmzone.h
+++ b/include/asm-ppc64/mmzone.h
@@ -67,9 +67,6 @@ static inline int pa_to_nid(unsigned long pa)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)	(NODE_DATA(nid)->node_end_pfn)
 
-#define local_mapnr(kvaddr) \
-	( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) 
-
 #ifdef CONFIG_DISCONTIGMEM
 
 /*

From ed8ece2ec8d3c2031b1a1a0737568bb0d49454e0 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:50 -0700
Subject: [PATCH 78/91] [PATCH] memory hotplug prep: break out zone
 initialization

If a zone is empty at boot time and memory is hot-added to it later, it needs
to run the same init code that would have been run on it at boot.

This patch breaks out zone table and per-cpu-pages functions for use by the
hotplug code.  You can almost see all of the free_area_init_core() function on
one page now.  :)

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 98 +++++++++++++++++++++++++++++--------------------
 1 file changed, 58 insertions(+), 40 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a2995a5d012c..9a2fa8110afc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1875,6 +1875,60 @@ void __init setup_per_cpu_pageset()
 
 #endif
 
+static __devinit
+void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+	int i;
+	struct pglist_data *pgdat = zone->zone_pgdat;
+
+	/*
+	 * The per-page waitqueue mechanism uses hashed waitqueues
+	 * per zone.
+	 */
+	zone->wait_table_size = wait_table_size(zone_size_pages);
+	zone->wait_table_bits =	wait_table_bits(zone->wait_table_size);
+	zone->wait_table = (wait_queue_head_t *)
+		alloc_bootmem_node(pgdat, zone->wait_table_size
+					* sizeof(wait_queue_head_t));
+
+	for(i = 0; i < zone->wait_table_size; ++i)
+		init_waitqueue_head(zone->wait_table + i);
+}
+
+static __devinit void zone_pcp_init(struct zone *zone)
+{
+	int cpu;
+	unsigned long batch = zone_batchsize(zone);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+		/* Early boot. Slab allocator not functional yet */
+		zone->pageset[cpu] = &boot_pageset[cpu];
+		setup_pageset(&boot_pageset[cpu],0);
+#else
+		setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+	}
+	printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+		zone->name, zone->present_pages, batch);
+}
+
+static __devinit void init_currently_empty_zone(struct zone *zone,
+		unsigned long zone_start_pfn, unsigned long size)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+
+	zone_wait_table_init(zone, size);
+	pgdat->nr_zones = zone_idx(zone) + 1;
+
+	zone->zone_mem_map = pfn_to_page(zone_start_pfn);
+	zone->zone_start_pfn = zone_start_pfn;
+
+	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+}
+
 /*
  * Set up the zone data structures:
  *   - mark all pages reserved
@@ -1884,8 +1938,8 @@ void __init setup_per_cpu_pageset()
 static void __init free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
-	unsigned long i, j;
-	int cpu, nid = pgdat->node_id;
+	unsigned long j;
+	int nid = pgdat->node_id;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
 
 	pgdat->nr_zones = 0;
@@ -1895,7 +1949,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize;
-		unsigned long batch;
 
 		realsize = size = zones_size[j];
 		if (zholes_size)
@@ -1915,19 +1968,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-		batch = zone_batchsize(zone);
-
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-			/* Early boot. Slab allocator not functional yet */
-			zone->pageset[cpu] = &boot_pageset[cpu];
-			setup_pageset(&boot_pageset[cpu],0);
-#else
-			setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
-		}
-		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-				zone_names[j], realsize, batch);
+		zone_pcp_init(zone);
 		INIT_LIST_HEAD(&zone->active_list);
 		INIT_LIST_HEAD(&zone->inactive_list);
 		zone->nr_scan_active = 0;
@@ -1938,32 +1979,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		if (!size)
 			continue;
 
-		/*
-		 * The per-page waitqueue mechanism uses hashed waitqueues
-		 * per zone.
-		 */
-		zone->wait_table_size = wait_table_size(size);
-		zone->wait_table_bits =
-			wait_table_bits(zone->wait_table_size);
-		zone->wait_table = (wait_queue_head_t *)
-			alloc_bootmem_node(pgdat, zone->wait_table_size
-						* sizeof(wait_queue_head_t));
-
-		for(i = 0; i < zone->wait_table_size; ++i)
-			init_waitqueue_head(zone->wait_table + i);
-
-		pgdat->nr_zones = j+1;
-
-		zone->zone_mem_map = pfn_to_page(zone_start_pfn);
-		zone->zone_start_pfn = zone_start_pfn;
-
-		memmap_init(size, nid, j, zone_start_pfn);
-
 		zonetable_add(zone, nid, j, zone_start_pfn, size);
-
+		init_currently_empty_zone(zone, zone_start_pfn, size);
 		zone_start_pfn += size;
-
-		zone_init_free_lists(pgdat, zone, zone->spanned_pages);
 	}
 }
 

From 4ca644d970bf2542623228a4624af356d20ca267 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:51 -0700
Subject: [PATCH 79/91] [PATCH] memory hotplug prep: __section_nr helper

A little helper that we use in the hotplug code.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h |  1 +
 mm/sparse.c            | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7519eb4191e7..4674145bb63d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -509,6 +509,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
 		return NULL;
 	return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
 }
+extern int __section_nr(struct mem_section* ms);
 
 /*
  * We use the lower bits of the mem_map pointer to store
diff --git a/mm/sparse.c b/mm/sparse.c
index 347249a4917a..0d3bd4bf3aaa 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -72,6 +72,31 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
 }
 #endif
 
+/*
+ * Although written for the SPARSEMEM_EXTREME case, this happens
+ * to also work for the flat array case because
+ * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
+ */
+int __section_nr(struct mem_section* ms)
+{
+	unsigned long root_nr;
+	struct mem_section* root;
+
+	for (root_nr = 0;
+	     root_nr < NR_MEM_SECTIONS;
+	     root_nr += SECTIONS_PER_ROOT) {
+		root = __nr_to_section(root_nr);
+
+		if (!root)
+			continue;
+
+		if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
+		     break;
+	}
+
+	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
+}
+
 /* Record a memory area against a node. */
 void memory_present(int nid, unsigned long start, unsigned long end)
 {

From c6a57e19e464db118dc4ab9cfe9e9748c6d630a0 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:52 -0700
Subject: [PATCH 80/91] [PATCH] memory hotplug prep: fixup bad_range()

When doing memory hotplug operations, the size of existing zones can obviously
change.  This means that zone->zone_{start_pfn,spanned_pages} can change.

There are currently no locks that protect these structure members.  However,
they are rarely accessed at runtime.  Outside of swsusp, the only place that I
can find is bad_range().

So, split bad_range() up into two pieces: one that needs to be locked and
another that doesn't.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9a2fa8110afc..a51ef94eec33 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -78,21 +78,37 @@ int min_free_kbytes = 1024;
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
-/*
- * Temporary debugging check for pages not lying within a given zone.
- */
-static int bad_range(struct zone *zone, struct page *page)
+static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
 	if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
 		return 1;
 	if (page_to_pfn(page) < zone->zone_start_pfn)
 		return 1;
+
+	return 0;
+}
+
+static int page_is_consistent(struct zone *zone, struct page *page)
+{
 #ifdef CONFIG_HOLES_IN_ZONE
 	if (!pfn_valid(page_to_pfn(page)))
-		return 1;
+		return 0;
 #endif
 	if (zone != page_zone(page))
+		return 0;
+
+	return 1;
+}
+/*
+ * Temporary debugging check for pages not lying within a given zone.
+ */
+static int bad_range(struct zone *zone, struct page *page)
+{
+	if (page_outside_zone_boundaries(zone, page))
 		return 1;
+	if (!page_is_consistent(zone, page))
+		return 1;
+
 	return 0;
 }
 

From 208d54e5513c0c02d85af0990901354c74364d5c Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:52 -0700
Subject: [PATCH 81/91] [PATCH] memory hotplug locking: node_size_lock

pgdat->node_size_lock is basically only needed in one place in the normal
code: show_mem(), which is the arch-specific sysrq-m printing function.

Strictly speaking, the architectures not doing memory hotplug do not need this
locking in show_mem().  However, they are all included for completeness.  This
should also make any future consolidation of all of the implementations a
little more straightforward.

This lock is also held in the sparsemem code during a memory removal, as
sections are invalidated.  This is the place where pfn_valid() is made false
for a memory area that's being removed.  The lock is only required when doing
pfn_valid() operations on memory for which the caller does not already hold a
page reference, such as in show_mem().
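
The usage pattern, sketched with the new helpers (count_reserved() is a
hypothetical example; the loop mirrors the show_mem() changes below):

	static unsigned long count_reserved(struct pglist_data *pgdat)
	{
		unsigned long i, flags, reserved = 0;

		pgdat_resize_lock(pgdat, &flags);
		for (i = 0; i < pgdat->node_spanned_pages; i++)
			if (PageReserved(pgdat_page_nr(pgdat, i)))
				reserved++;
		pgdat_resize_unlock(pgdat, &flags);

		return reserved;
	}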

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/mm/numa.c           |  3 +++
 arch/i386/mm/pgtable.c         |  3 +++
 arch/ia64/mm/discontig.c       |  7 ++++++-
 arch/m32r/mm/init.c            |  9 ++++++++-
 arch/parisc/mm/init.c          |  3 +++
 arch/ppc64/mm/init.c           |  6 ++++++
 include/linux/memory_hotplug.h | 34 ++++++++++++++++++++++++++++++++++
 include/linux/mmzone.h         | 12 ++++++++++++
 mm/page_alloc.c                |  1 +
 9 files changed, 76 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/memory_hotplug.h

diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index c7481d59b6df..6d5251254f68 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -371,6 +371,8 @@ show_mem(void)
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_online_node(nid) {
+		unsigned long flags;
+		pgdat_resize_lock(NODE_DATA(nid), &flags);
 		i = node_spanned_pages(nid);
 		while (i-- > 0) {
 			struct page *page = nid_page_nr(nid, i);
@@ -384,6 +386,7 @@ show_mem(void)
 			else
 				shared += page_count(page) - 1;
 		}
+		pgdat_resize_unlock(NODE_DATA(nid), &flags);
 	}
 	printk("%ld pages of RAM\n",total);
 	printk("%ld free pages\n",free);
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 39c099f15b5f..9db3242103be 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -31,11 +31,13 @@ void show_mem(void)
 	pg_data_t *pgdat;
 	unsigned long i;
 	struct page_state ps;
+	unsigned long flags;
 
 	printk(KERN_INFO "Mem-info:\n");
 	show_free_areas();
 	printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
+		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
 			page = pgdat_page_nr(pgdat, i);
 			total++;
@@ -48,6 +50,7 @@ void show_mem(void)
 			else if (page_count(page))
 				shared += page_count(page) - 1;
 		}
+		pgdat_resize_unlock(pgdat, &flags);
 	}
 	printk(KERN_INFO "%d pages of RAM\n", total);
 	printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index a3788fb84809..a88cdb7232f8 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -555,9 +555,13 @@ void show_mem(void)
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
-		unsigned long present = pgdat->node_present_pages;
+		unsigned long present;
+		unsigned long flags;
 		int shared = 0, cached = 0, reserved = 0;
+
 		printk("Node ID: %d\n", pgdat->node_id);
+		pgdat_resize_lock(pgdat, &flags);
+		present = pgdat->node_present_pages;
 		for(i = 0; i < pgdat->node_spanned_pages; i++) {
 			struct page *page;
 			if (pfn_valid(pgdat->node_start_pfn + i))
@@ -571,6 +575,7 @@ void show_mem(void)
 			else if (page_count(page))
 				shared += page_count(page)-1;
 		}
+		pgdat_resize_unlock(pgdat, &flags);
 		total_present += present;
 		total_reserved += reserved;
 		total_cached += cached;
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index d9a40b1fe8ba..6facf15b04f3 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -48,6 +48,8 @@ void show_mem(void)
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
+		unsigned long flags;
+		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
 			page = pgdat_page_nr(pgdat, i);
 			total++;
@@ -60,6 +62,7 @@ void show_mem(void)
 			else if (page_count(page))
 				shared += page_count(page) - 1;
 		}
+		pgdat_resize_unlock(pgdat, &flags);
 	}
 	printk("%d pages of RAM\n", total);
 	printk("%d pages of HIGHMEM\n",highmem);
@@ -150,10 +153,14 @@ int __init reservedpages_count(void)
 	int reservedpages, nid, i;
 
 	reservedpages = 0;
-	for_each_online_node(nid)
+	for_each_online_node(nid) {
+		unsigned long flags;
+		pgdat_resize_lock(NODE_DATA(nid), &flags);
 		for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++)
 			if (PageReserved(nid_page_nr(nid, i)))
 				reservedpages++;
+		pgdat_resize_unlock(NODE_DATA(nid), &flags);
+	}
 
 	return reservedpages;
 }
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 2886ad70db48..29b998e430e6 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -505,7 +505,9 @@ void show_mem(void)
 
 		for (j = node_start_pfn(i); j < node_end_pfn(i); j++) {
 			struct page *p;
+			unsigned long flags;
 
+			pgdat_resize_lock(NODE_DATA(i), &flags);
 			p = nid_page_nr(i, j) - node_start_pfn(i);
 
 			total++;
@@ -517,6 +519,7 @@ void show_mem(void)
 				free++;
 			else
 				shared += page_count(p) - 1;
+			pgdat_resize_unlock(NODE_DATA(i), &flags);
         	}
 	}
 #endif
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index a45584b3440c..975b26de34d6 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -104,6 +104,8 @@ void show_mem(void)
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
+		unsigned long flags;
+		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; i++) {
 			page = pgdat_page_nr(pgdat, i);
 			total++;
@@ -114,6 +116,7 @@ void show_mem(void)
 			else if (page_count(page))
 				shared += page_count(page) - 1;
 		}
+		pgdat_resize_unlock(pgdat, &flags);
 	}
 	printk("%ld pages of RAM\n", total);
 	printk("%ld reserved pages\n", reserved);
@@ -647,11 +650,14 @@ void __init mem_init(void)
 #endif
 
 	for_each_pgdat(pgdat) {
+		unsigned long flags;
+		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; i++) {
 			page = pgdat_page_nr(pgdat, i);
 			if (PageReserved(page))
 				reservedpages++;
 		}
+		pgdat_resize_unlock(pgdat, &flags);
 	}
 
 	codesize = (unsigned long)&_etext - (unsigned long)&_stext;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
new file mode 100644
index 000000000000..e8103be9d528
--- /dev/null
+++ b/include/linux/memory_hotplug.h
@@ -0,0 +1,34 @@
+#ifndef __LINUX_MEMORY_HOTPLUG_H
+#define __LINUX_MEMORY_HOTPLUG_H
+
+#include <linux/mmzone.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * pgdat resizing functions
+ */
+static inline
+void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
+{
+	spin_lock_irqsave(&pgdat->node_size_lock, *flags);
+}
+static inline
+void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
+{
+	spin_lock_irqrestore(&pgdat->node_size_lock, *flags);
+}
+static inline
+void pgdat_resize_init(struct pglist_data *pgdat)
+{
+	spin_lock_init(&pgdat->node_size_lock);
+}
+#else /* ! CONFIG_MEMORY_HOTPLUG */
+/*
+ * Stub functions for when hotplug is off
+ */
+static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
+static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
+static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
+#endif
+#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4674145bb63d..e050d68963a1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -273,6 +273,16 @@ typedef struct pglist_data {
 	struct page *node_mem_map;
 #endif
 	struct bootmem_data *bdata;
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/*
+	 * Must be held any time you expect node_start_pfn, node_present_pages
+	 * or node_spanned_pages to stay constant.  Holding this will also
+	 * guarantee that any pfn_valid() stays that way.
+	 *
+	 * Nests above zone->lock and zone->size_seqlock.
+	 */
+	spinlock_t node_size_lock;
+#endif
 	unsigned long node_start_pfn;
 	unsigned long node_present_pages; /* total number of physical pages */
 	unsigned long node_spanned_pages; /* total size of physical page
@@ -293,6 +303,8 @@ typedef struct pglist_data {
 #endif
 #define nid_page_nr(nid, pagenr) 	pgdat_page_nr(NODE_DATA(nid),(pagenr))
 
+#include <linux/memory_hotplug.h>
+
 extern struct pglist_data *pgdat_list;
 
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a51ef94eec33..32fad6d23200 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1958,6 +1958,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 	int nid = pgdat->node_id;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
 
+	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	pgdat->kswapd_max_order = 0;

From bdc8cb984576ab5b550c8b24c6fa111a873503e3 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:53 -0700
Subject: [PATCH 82/91] [PATCH] memory hotplug locking: zone span seqlock

See the "fixup bad_range()" patch for more information, but this actually
creates the lock that protects code making assumptions about a zone's size
staying constant at runtime.
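
Readers use the ordinary seqlock retry loop through the new wrappers; a
minimal sketch of a lockless bounds check, mirroring the bad_range()
rework in this patch:

	unsigned seq;
	int outside;

	do {
		seq = zone_span_seqbegin(zone);
		outside = pfn < zone->zone_start_pfn ||
			  pfn >= zone->zone_start_pfn + zone->spanned_pages;
	} while (zone_span_seqretry(zone, seq));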

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/memory_hotplug.h | 39 ++++++++++++++++++++++++++++++++--
 include/linux/mmzone.h         | 15 +++++++++++++
 mm/page_alloc.c                | 19 ++++++++++++-----
 3 files changed, 66 insertions(+), 7 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index e8103be9d528..4b08bc947578 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -16,13 +16,36 @@ void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
 static inline
 void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
 {
-	spin_lock_irqrestore(&pgdat->node_size_lock, *flags);
+	spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
 }
 static inline
 void pgdat_resize_init(struct pglist_data *pgdat)
 {
 	spin_lock_init(&pgdat->node_size_lock);
 }
+/*
+ * Zone resizing functions
+ */
+static inline unsigned zone_span_seqbegin(struct zone *zone)
+{
+	return read_seqbegin(&zone->span_seqlock);
+}
+static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
+{
+	return read_seqretry(&zone->span_seqlock, iv);
+}
+static inline void zone_span_writelock(struct zone *zone)
+{
+	write_seqlock(&zone->span_seqlock);
+}
+static inline void zone_span_writeunlock(struct zone *zone)
+{
+	write_sequnlock(&zone->span_seqlock);
+}
+static inline void zone_seqlock_init(struct zone *zone)
+{
+	seqlock_init(&zone->span_seqlock);
+}
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 /*
  * Stub functions for when hotplug is off
@@ -30,5 +53,17 @@ void pgdat_resize_init(struct pglist_data *pgdat)
 static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
 static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
 static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
-#endif
+
+static inline unsigned zone_span_seqbegin(struct zone *zone)
+{
+	return 0;
+}
+static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
+{
+	return 0;
+}
+static inline void zone_span_writelock(struct zone *zone) {}
+static inline void zone_span_writeunlock(struct zone *zone) {}
+static inline void zone_seqlock_init(struct zone *zone) {}
+#endif /* ! CONFIG_MEMORY_HOTPLUG */
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e050d68963a1..f5fa3082fd6a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -12,6 +12,7 @@
 #include <linux/threads.h>
 #include <linux/numa.h>
 #include <linux/init.h>
+#include <linux/seqlock.h>
 #include <asm/atomic.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -137,6 +138,10 @@ struct zone {
 	 * free areas of different sizes
 	 */
 	spinlock_t		lock;
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/* see spanned/present_pages for more description */
+	seqlock_t		span_seqlock;
+#endif
 	struct free_area	free_area[MAX_ORDER];
 
 
@@ -220,6 +225,16 @@ struct zone {
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
 
+	/*
+	 * zone_start_pfn, spanned_pages and present_pages are all
+	 * protected by span_seqlock.  It is a seqlock because it has
+	 * to be read outside of zone->lock, and it is done in the main
+	 * allocator path.  But, it is written quite infrequently.
+	 *
+	 * The lock is declared along with zone->lock because it is
+	 * frequently read in proximity to zone->lock.  It's good to
+	 * give them a chance of being in the same cacheline.
+	 */
 	unsigned long		spanned_pages;	/* total size, including holes */
 	unsigned long		present_pages;	/* amount of memory (excluding holes) */
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 32fad6d23200..817635f2ab62 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -33,6 +33,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/memory_hotplug.h>
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
 
@@ -80,12 +81,19 @@ unsigned long __initdata nr_all_pages;
 
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
-	if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
-		return 1;
-	if (page_to_pfn(page) < zone->zone_start_pfn)
-		return 1;
+	int ret = 0;
+	unsigned seq;
+	unsigned long pfn = page_to_pfn(page);
 
-	return 0;
+	do {
+		seq = zone_span_seqbegin(zone);
+		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
+			ret = 1;
+		else if (pfn < zone->zone_start_pfn)
+			ret = 1;
+	} while (zone_span_seqretry(zone, seq));
+
+	return ret;
 }
 
 static int page_is_consistent(struct zone *zone, struct page *page)
@@ -1980,6 +1988,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
 		spin_lock_init(&zone->lru_lock);
+		zone_seqlock_init(zone);
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
 

From 3947be1969a9ce455ec30f60ef51efb10e4323d1 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:54 -0700
Subject: [PATCH 83/91] [PATCH] memory hotplug: sysfs and add/remove functions

This adds generic memory add/remove and supporting functions for memory
hotplug into a new file as well as a memory hotplug kernel config option.

Individual architecture patches will follow.

For now, disable memory hotplug when swsusp is enabled.  There's a lot of
churn there right now.  We'll fix it up properly once it calms down.
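
Kernel components that care about these transitions can hook the new
notifier chain; a hypothetical sketch using the hotplug_memory_notifier()
helper and the MEM_* states added in include/linux/memory.h:

	static int example_mem_event(struct notifier_block *nb,
				     unsigned long action, void *arg)
	{
		switch (action) {
		case MEM_ONLINE:
			/* newly added pages are now usable */
			break;
		case MEM_GOING_OFFLINE:
		case MEM_OFFLINE:
			/* stop referencing the affected range */
			break;
		}
		return NOTIFY_OK;
	}

	/* at init time: */
	hotplug_memory_notifier(example_mem_event, 0);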

Signed-off-by: Matt Tolentino <matthew.e.tolentino@intel.com>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/base/Makefile          |   1 +
 drivers/base/init.c            |   2 +
 drivers/base/memory.c          | 455 +++++++++++++++++++++++++++++++++
 include/linux/memory.h         |  94 +++++++
 include/linux/memory_hotplug.h |  35 +++
 include/linux/mm.h             |   1 +
 mm/Kconfig                     |   8 +
 mm/Makefile                    |   2 +-
 mm/memory_hotplug.c            | 178 +++++++++++++
 mm/page_alloc.c                |   4 +-
 10 files changed, 777 insertions(+), 3 deletions(-)
 create mode 100644 drivers/base/memory.c
 create mode 100644 include/linux/memory.h
 create mode 100644 mm/memory_hotplug.c

diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 66d9c4643fc1..f12898d53078 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -7,6 +7,7 @@ obj-y			:= core.o sys.o bus.o dd.o \
 obj-y			+= power/
 obj-$(CONFIG_FW_LOADER)	+= firmware_class.o
 obj-$(CONFIG_NUMA)	+= node.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
 
 ifeq ($(CONFIG_DEBUG_DRIVER),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/drivers/base/init.c b/drivers/base/init.c
index 84e604e25c4f..c648914b9cde 100644
--- a/drivers/base/init.c
+++ b/drivers/base/init.c
@@ -9,6 +9,7 @@
 
 #include <linux/device.h>
 #include <linux/init.h>
+#include <linux/memory.h>
 
 #include "base.h"
 
@@ -33,5 +34,6 @@ void __init driver_init(void)
 	platform_bus_init();
 	system_bus_init();
 	cpu_dev_init();
+	memory_dev_init();
 	attribute_container_init();
 }
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
new file mode 100644
index 000000000000..785cb6e6b91c
--- /dev/null
+++ b/drivers/base/memory.c
@@ -0,0 +1,455 @@
+/*
+ * drivers/base/memory.c - basic Memory class support
+ *
+ * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
+ *            Dave Hansen <haveblue@us.ibm.com>
+ *
+ * This file provides the necessary infrastructure to represent
+ * a SPARSEMEM-memory-model system's physical memory in /sysfs.
+ * All arch-independent code that assumes MEMORY_HOTPLUG requires
+ * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
+ */
+
+#include <linux/sysdev.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>	/* capable() */
+#include <linux/topology.h>
+#include <linux/device.h>
+#include <linux/memory.h>
+#include <linux/kobject.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+
+#define MEMORY_CLASS_NAME	"memory"
+
+static struct sysdev_class memory_sysdev_class = {
+	set_kset_name(MEMORY_CLASS_NAME),
+};
+EXPORT_SYMBOL(memory_sysdev_class);
+
+static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj)
+{
+	return MEMORY_CLASS_NAME;
+}
+
+static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
+			int num_envp, char *buffer, int buffer_size)
+{
+	int retval = 0;
+
+	return retval;
+}
+
+static struct kset_hotplug_ops memory_hotplug_ops = {
+	.name		= memory_hotplug_name,
+	.hotplug	= memory_hotplug,
+};
+
+static struct notifier_block *memory_chain;
+
+static int register_memory_notifier(struct notifier_block *nb)
+{
+        return notifier_chain_register(&memory_chain, nb);
+}
+
+static void unregister_memory_notifier(struct notifier_block *nb)
+{
+        notifier_chain_unregister(&memory_chain, nb);
+}
+
+/*
+ * register_memory - Setup a sysfs device for a memory block
+ */
+static int
+register_memory(struct memory_block *memory, struct mem_section *section,
+		struct node *root)
+{
+	int error;
+
+	memory->sysdev.cls = &memory_sysdev_class;
+	memory->sysdev.id = __section_nr(section);
+
+	error = sysdev_register(&memory->sysdev);
+
+	if (root && !error)
+		error = sysfs_create_link(&root->sysdev.kobj,
+					  &memory->sysdev.kobj,
+					  kobject_name(&memory->sysdev.kobj));
+
+	return error;
+}
+
+static void
+unregister_memory(struct memory_block *memory, struct mem_section *section,
+		struct node *root)
+{
+	BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
+	BUG_ON(memory->sysdev.id != __section_nr(section));
+
+	sysdev_unregister(&memory->sysdev);
+	if (root)
+		sysfs_remove_link(&root->sysdev.kobj,
+				  kobject_name(&memory->sysdev.kobj));
+}
+
+/*
+ * use this as the physical section index that this memsection
+ * uses.
+ */
+
+static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf)
+{
+	struct memory_block *mem =
+		container_of(dev, struct memory_block, sysdev);
+	return sprintf(buf, "%08lx\n", mem->phys_index);
+}
+
+/*
+ * online, offline, going offline, etc.
+ */
+static ssize_t show_mem_state(struct sys_device *dev, char *buf)
+{
+	struct memory_block *mem =
+		container_of(dev, struct memory_block, sysdev);
+	ssize_t len = 0;
+
+	/*
+	 * We can probably put these states in a nice little array
+	 * so that they're not open-coded
+	 */
+	switch (mem->state) {
+		case MEM_ONLINE:
+			len = sprintf(buf, "online\n");
+			break;
+		case MEM_OFFLINE:
+			len = sprintf(buf, "offline\n");
+			break;
+		case MEM_GOING_OFFLINE:
+			len = sprintf(buf, "going-offline\n");
+			break;
+		default:
+			len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
+					mem->state);
+			WARN_ON(1);
+			break;
+	}
+
+	return len;
+}
+
+static inline int memory_notify(unsigned long val, void *v)
+{
+	return notifier_call_chain(&memory_chain, val, v);
+}
+
+/*
+ * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
+ * OK to have direct references to sparsemem variables in here.
+ */
+static int
+memory_block_action(struct memory_block *mem, unsigned long action)
+{
+	int i;
+	unsigned long psection;
+	unsigned long start_pfn, start_paddr;
+	struct page *first_page;
+	int ret;
+	int old_state = mem->state;
+
+	psection = mem->phys_index;
+	first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);
+
+	/*
+	 * The probe routines leave the pages reserved, just
+	 * as the bootmem code does.  Make sure they're still
+	 * that way.
+	 */
+	if (action == MEM_ONLINE) {
+		for (i = 0; i < PAGES_PER_SECTION; i++) {
+			if (PageReserved(first_page+i))
+				continue;
+
+			printk(KERN_WARNING "section number %ld page number %d "
+				"not reserved, was it already online? \n",
+				psection, i);
+			return -EBUSY;
+		}
+	}
+
+	switch (action) {
+		case MEM_ONLINE:
+			start_pfn = page_to_pfn(first_page);
+			ret = online_pages(start_pfn, PAGES_PER_SECTION);
+			break;
+		case MEM_OFFLINE:
+			mem->state = MEM_GOING_OFFLINE;
+			memory_notify(MEM_GOING_OFFLINE, NULL);
+			start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
+			ret = remove_memory(start_paddr,
+					    PAGES_PER_SECTION << PAGE_SHIFT);
+			if (ret) {
+				mem->state = old_state;
+				break;
+			}
+			memory_notify(MEM_MAPPING_INVALID, NULL);
+			break;
+		default:
+			printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n",
+					__FUNCTION__, mem, action, action);
+			WARN_ON(1);
+			ret = -EINVAL;
+	}
+	/*
+	 * For now, only notify on successful memory operations
+	 */
+	if (!ret)
+		memory_notify(action, NULL);
+
+	return ret;
+}
+
+static int memory_block_change_state(struct memory_block *mem,
+		unsigned long to_state, unsigned long from_state_req)
+{
+	int ret = 0;
+	down(&mem->state_sem);
+
+	if (mem->state != from_state_req) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = memory_block_action(mem, to_state);
+	if (!ret)
+		mem->state = to_state;
+
+out:
+	up(&mem->state_sem);
+	return ret;
+}
+
+static ssize_t
+store_mem_state(struct sys_device *dev, const char *buf, size_t count)
+{
+	struct memory_block *mem;
+	unsigned int phys_section_nr;
+	int ret = -EINVAL;
+
+	mem = container_of(dev, struct memory_block, sysdev);
+	phys_section_nr = mem->phys_index;
+
+	if (!valid_section_nr(phys_section_nr))
+		goto out;
+
+	if (!strncmp(buf, "online", min((int)count, 6)))
+		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
+	else if(!strncmp(buf, "offline", min((int)count, 7)))
+		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
+out:
+	if (ret)
+		return ret;
+	return count;
+}
+
+/*
+ * phys_device is a bad name for this.  What I really want
+ * is a way to differentiate between memory ranges that
+ * are part of physical devices that constitute
+ * a complete removable unit or fru.
+ * i.e. do these ranges belong to the same physical device,
+ * s.t. if I offline all of these sections I can then
+ * remove the physical device?
+ */
+static ssize_t show_phys_device(struct sys_device *dev, char *buf)
+{
+	struct memory_block *mem =
+		container_of(dev, struct memory_block, sysdev);
+	return sprintf(buf, "%d\n", mem->phys_device);
+}
+
+static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
+static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
+static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
+
+#define mem_create_simple_file(mem, attr_name)	\
+	sysdev_create_file(&mem->sysdev, &attr_##attr_name)
+#define mem_remove_simple_file(mem, attr_name)	\
+	sysdev_remove_file(&mem->sysdev, &attr_##attr_name)
+
+/*
+ * Block size attribute stuff
+ */
+static ssize_t
+print_block_size(struct class *class, char *buf)
+{
+	return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE);
+}
+
+static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
+
+static int block_size_init(void)
+{
+	sysfs_create_file(&memory_sysdev_class.kset.kobj,
+		&class_attr_block_size_bytes.attr);
+	return 0;
+}
+
+/*
+ * Some architectures will have custom drivers to do this, and
+ * will not need to do it from userspace.  The fake hot-add code
+ * as well as ppc64 will do all of their discovery in userspace
+ * and will require this interface.
+ */
+#ifdef CONFIG_ARCH_MEMORY_PROBE
+static ssize_t
+memory_probe_store(struct class *class, const char __user *buf, size_t count)
+{
+	u64 phys_addr;
+	int ret;
+
+	phys_addr = simple_strtoull(buf, NULL, 0);
+
+	ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
+
+	if (ret)
+		count = ret;
+
+	return count;
+}
+static CLASS_ATTR(probe, 0700, NULL, memory_probe_store);
+
+static int memory_probe_init(void)
+{
+	sysfs_create_file(&memory_sysdev_class.kset.kobj,
+		&class_attr_probe.attr);
+	return 0;
+}
+#else
+#define memory_probe_init(...)	do {} while (0)
+#endif
+
+/*
+ * Note that phys_device is optional.  It is here to allow for
+ * differentiation between which *physical* devices each
+ * section belongs to...
+ */
+
+static int add_memory_block(unsigned long node_id, struct mem_section *section,
+		     unsigned long state, int phys_device)
+{
+	size_t size = sizeof(struct memory_block);
+	struct memory_block *mem = kmalloc(size, GFP_KERNEL);
+	int ret = 0;
+
+	if (!mem)
+		return -ENOMEM;
+
+	memset(mem, 0, size);
+
+	mem->phys_index = __section_nr(section);
+	mem->state = state;
+	init_MUTEX(&mem->state_sem);
+	mem->phys_device = phys_device;
+
+	ret = register_memory(mem, section, NULL);
+	if (!ret)
+		ret = mem_create_simple_file(mem, phys_index);
+	if (!ret)
+		ret = mem_create_simple_file(mem, state);
+	if (!ret)
+		ret = mem_create_simple_file(mem, phys_device);
+
+	return ret;
+}
+
+/*
+ * For now, we have a linear search to go find the appropriate
+ * memory_block corresponding to a particular phys_index. If
+ * this gets to be a real problem, we can always use a radix
+ * tree or something here.
+ *
+ * This could be made generic for all sysdev classes.
+ */
+static struct memory_block *find_memory_block(struct mem_section *section)
+{
+	struct kobject *kobj;
+	struct sys_device *sysdev;
+	struct memory_block *mem;
+	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
+
+	/*
+	 * This only works because we know that section == sysdev->id
+	 * slightly redundant with sysdev_register()
+	 */
+	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));
+
+	kobj = kset_find_obj(&memory_sysdev_class.kset, name);
+	if (!kobj)
+		return NULL;
+
+	sysdev = container_of(kobj, struct sys_device, kobj);
+	mem = container_of(sysdev, struct memory_block, sysdev);
+
+	return mem;
+}
+
+int remove_memory_block(unsigned long node_id, struct mem_section *section,
+		int phys_device)
+{
+	struct memory_block *mem;
+
+	mem = find_memory_block(section);
+	mem_remove_simple_file(mem, phys_index);
+	mem_remove_simple_file(mem, state);
+	mem_remove_simple_file(mem, phys_device);
+	unregister_memory(mem, section, NULL);
+
+	return 0;
+}
+
+/*
+ * need an interface for the VM to add new memory regions,
+ * but without onlining it.
+ */
+int register_new_memory(struct mem_section *section)
+{
+	return add_memory_block(0, section, MEM_OFFLINE, 0);
+}
+
+int unregister_memory_section(struct mem_section *section)
+{
+	if (!valid_section(section))
+		return -EINVAL;
+
+	return remove_memory_block(0, section, 0);
+}
+
+/*
+ * Initialize the sysfs support for memory devices...
+ */
+int __init memory_dev_init(void)
+{
+	unsigned int i;
+	int ret;
+
+	memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops;
+	ret = sysdev_class_register(&memory_sysdev_class);
+
+	/*
+	 * Create entries for memory sections that were found
+	 * during boot and have been initialized
+	 */
+	for (i = 0; i < NR_MEM_SECTIONS; i++) {
+		if (!valid_section_nr(i))
+			continue;
+		add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0);
+	}
+
+	memory_probe_init();
+	block_size_init();
+
+	return ret;
+}
diff --git a/include/linux/memory.h b/include/linux/memory.h
new file mode 100644
index 000000000000..0def328ab5cf
--- /dev/null
+++ b/include/linux/memory.h
@@ -0,0 +1,94 @@
+/*
+ * include/linux/memory.h - generic memory definition
+ *
+ * This is mainly for topological representation. We define the
+ * basic "struct memory_block" here, which can be embedded in per-arch
+ * definitions or NUMA information.
+ *
+ * Basic handling of the devices is done in drivers/base/memory.c
+ * and system devices are handled in drivers/base/sys.c.
+ *
+ * Memory blocks are exported via sysfs in the class/memory/devices/
+ * directory.
+ *
+ */
+#ifndef _LINUX_MEMORY_H_
+#define _LINUX_MEMORY_H_
+
+#include <linux/sysdev.h>
+#include <linux/node.h>
+#include <linux/compiler.h>
+
+#include <asm/semaphore.h>
+
+struct memory_block {
+	unsigned long phys_index;
+	unsigned long state;
+	/*
+	 * This serializes all state change requests.  It isn't
+	 * held during creation because the control files are
+	 * created long after the critical areas during
+	 * initialization.
+	 */
+	struct semaphore state_sem;
+	int phys_device;		/* to which fru does this belong? */
+	void *hw;			/* optional pointer to fw/hw data */
+	int (*phys_callback)(struct memory_block *);
+	struct sys_device sysdev;
+};
+
+/* These states are exposed to userspace as text strings in sysfs */
+#define	MEM_ONLINE		(1<<0) /* exposed to userspace */
+#define	MEM_GOING_OFFLINE	(1<<1) /* exposed to userspace */
+#define	MEM_OFFLINE		(1<<2) /* exposed to userspace */
+
+/*
+ * All of these states are currently kernel-internal for notifying
+ * kernel components and architectures.
+ *
+ * For MEM_MAPPING_INVALID, all notifier chains with priority >0
+ * are called before pfn_to_page() becomes invalid.  The priority=0
+ * entry is reserved for the function that actually makes
+ * pfn_to_page() stop working.  Any notifiers that want to be called
+ * after that should have priority <0.
+ */
+#define	MEM_MAPPING_INVALID	(1<<3)
+
+#ifndef CONFIG_MEMORY_HOTPLUG
+static inline int memory_dev_init(void)
+{
+	return 0;
+}
+static inline int register_memory_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+static inline void unregister_memory_notifier(struct notifier_block *nb)
+{
+}
+#else
+extern int register_memory(struct memory_block *, struct mem_section *section, struct node *);
+extern int register_new_memory(struct mem_section *);
+extern int unregister_memory_section(struct mem_section *);
+extern int memory_dev_init(void);
+extern int register_memory_notifier(struct notifier_block *nb);
+extern void unregister_memory_notifier(struct notifier_block *nb);
+
+#define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
+
+extern int invalidate_phys_mapping(unsigned long, unsigned long);
+struct notifier_block;
+
+extern int register_memory_notifier(struct notifier_block *nb);
+extern void unregister_memory_notifier(struct notifier_block *nb);
+
+extern struct sysdev_class memory_sysdev_class;
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#define hotplug_memory_notifier(fn, pri) {			\
+	static struct notifier_block fn##_mem_nb =		\
+		{ .notifier_call = fn, .priority = pri };	\
+	register_memory_notifier(&fn##_mem_nb);			\
+}
+
+#endif /* _LINUX_MEMORY_H_ */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4b08bc947578..01f03bc06eff 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -3,6 +3,8 @@
 
 #include <linux/mmzone.h>
 #include <linux/spinlock.h>
+#include <linux/mmzone.h>
+#include <linux/notifier.h>
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
@@ -46,6 +48,19 @@ static inline void zone_seqlock_init(struct zone *zone)
 {
 	seqlock_init(&zone->span_seqlock);
 }
+extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
+extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
+extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
+/* need some defines for these for archs that don't support it */
+extern void online_page(struct page *page);
+/* VM interface that may be used by firmware interface */
+extern int add_memory(u64 start, u64 size);
+extern int remove_memory(u64 start, u64 size);
+extern int online_pages(unsigned long, unsigned long);
+
+/* reasonably generic interface to expand the physical pages in a zone  */
+extern int __add_pages(struct zone *zone, unsigned long start_pfn,
+	unsigned long nr_pages);
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 /*
  * Stub functions for when hotplug is off
@@ -65,5 +80,25 @@ static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
 static inline void zone_span_writelock(struct zone *zone) {}
 static inline void zone_span_writeunlock(struct zone *zone) {}
 static inline void zone_seqlock_init(struct zone *zone) {}
+
+static inline int mhp_notimplemented(const char *func)
+{
+	printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func);
+	dump_stack();
+	return -ENOSYS;
+}
+
+static inline int __add_pages(struct zone *zone, unsigned long start_pfn,
+	unsigned long nr_pages)
+{
+	return mhp_notimplemented(__FUNCTION__);
+}
 #endif /* ! CONFIG_MEMORY_HOTPLUG */
+static inline int __remove_pages(struct zone *zone, unsigned long start_pfn,
+	unsigned long nr_pages)
+{
+	printk(KERN_WARNING "%s() called, not yet supported\n", __FUNCTION__);
+	dump_stack();
+	return -ENOSYS;
+}
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8a514eca40d5..5c1fb0a2e806 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -840,6 +840,7 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat,
 	unsigned long * zones_size, unsigned long zone_start_pfn, 
 	unsigned long *zholes_size);
 extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
+extern void setup_per_zone_pages_min(void);
 extern void mem_init(void);
 extern void show_mem(void);
 extern void si_meminfo(struct sysinfo * val);
diff --git a/mm/Kconfig b/mm/Kconfig
index f35a550ba4b9..1a4473fcb2ca 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -112,6 +112,14 @@ config SPARSEMEM_EXTREME
 	def_bool y
 	depends on SPARSEMEM && !SPARSEMEM_STATIC
 
+# eventually, we can have this option just 'select SPARSEMEM'
+config MEMORY_HOTPLUG
+	bool "Allow for memory hot-add"
+	depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
+
+comment "Memory hotplug is currently incompatible with Software Suspend"
+	depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
+
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
diff --git a/mm/Makefile b/mm/Makefile
index 4cd69e3ce421..2fa6d2ca9f28 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
-
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
new file mode 100644
index 000000000000..855e0fc928b3
--- /dev/null
+++ b/mm/memory_hotplug.c
@@ -0,0 +1,178 @@
+/*
+ *  linux/mm/memory_hotplug.c
+ *
+ *  Copyright (C)
+ */
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/pagevec.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+
+#include <asm/tlbflush.h>
+
+static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
+{
+	struct page *page, *ret;
+	unsigned long memmap_size = sizeof(struct page) * nr_pages;
+
+	page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
+	if (page)
+		goto got_map_page;
+
+	ret = vmalloc(memmap_size);
+	if (ret)
+		goto got_map_ptr;
+
+	return NULL;
+got_map_page:
+	ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
+got_map_ptr:
+	memset(ret, 0, memmap_size);
+
+	return ret;
+}
+
+extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+			  unsigned long size);
+static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	int nr_pages = PAGES_PER_SECTION;
+	int nid = pgdat->node_id;
+	int zone_type;
+
+	zone_type = zone - pgdat->node_zones;
+	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
+	zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
+}
+
+extern int sparse_add_one_section(struct zone *, unsigned long,
+				  struct page *mem_map);
+static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	int nr_pages = PAGES_PER_SECTION;
+	struct page *memmap;
+	int ret;
+	unsigned long flags;
+
+	/*
+	 * This can potentially allocate memory, and does its own
+	 * internal locking.
+	 */
+	sparse_index_init(pfn_to_section_nr(phys_start_pfn), pgdat->node_id);
+
+	pgdat_resize_lock(pgdat, &flags);
+	memmap = __kmalloc_section_memmap(nr_pages);
+	ret = sparse_add_one_section(zone, phys_start_pfn, memmap);
+	pgdat_resize_unlock(pgdat, &flags);
+
+	if (ret <= 0) {
+		/* the mem_map didn't get used */
+		if (memmap >= (struct page *)VMALLOC_START &&
+		    memmap < (struct page *)VMALLOC_END)
+			vfree(memmap);
+		else
+			free_pages((unsigned long)memmap,
+				   get_order(sizeof(struct page) * nr_pages));
+	}
+
+	if (ret < 0)
+		return ret;
+
+	__add_zone(zone, phys_start_pfn);
+	return register_new_memory(__pfn_to_section(phys_start_pfn));
+}
+
+/*
+ * Reasonably generic function for adding memory.  It is
+ * expected that archs that support memory hotplug will
+ * call this function after deciding the zone to which to
+ * add the new pages.
+ */
+int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
+		 unsigned long nr_pages)
+{
+	unsigned long i;
+	int err = 0;
+
+	for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
+		err = __add_section(zone, phys_start_pfn + i);
+
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+static void grow_zone_span(struct zone *zone,
+		unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long old_zone_end_pfn;
+
+	zone_span_writelock(zone);
+
+	old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	if (start_pfn < zone->zone_start_pfn)
+		zone->zone_start_pfn = start_pfn;
+
+	if (end_pfn > old_zone_end_pfn)
+		zone->spanned_pages = end_pfn - zone->zone_start_pfn;
+
+	zone_span_writeunlock(zone);
+}
+
+static void grow_pgdat_span(struct pglist_data *pgdat,
+		unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long old_pgdat_end_pfn =
+		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+	if (start_pfn < pgdat->node_start_pfn)
+		pgdat->node_start_pfn = start_pfn;
+
+	if (end_pfn > old_pgdat_end_pfn)
+		pgdat->node_spanned_pages = end_pfn - pgdat->node_start_pfn;
+}
+
+int online_pages(unsigned long pfn, unsigned long nr_pages)
+{
+	unsigned long i;
+	unsigned long flags;
+	unsigned long onlined_pages = 0;
+	struct zone *zone;
+
+	/*
+	 * This doesn't need a lock to do pfn_to_page().
+	 * The section can't be removed here because of the
+	 * memory_block->state_sem.
+	 */
+	zone = page_zone(pfn_to_page(pfn));
+	pgdat_resize_lock(zone->zone_pgdat, &flags);
+	grow_zone_span(zone, pfn, pfn + nr_pages);
+	grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
+	pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pfn_to_page(pfn + i);
+		online_page(page);
+		onlined_pages++;
+	}
+	zone->present_pages += onlined_pages;
+
+	return 0;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 817635f2ab62..183abf39b445 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1686,7 +1686,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
  * up by free_all_bootmem() once the early boot process is
  * done. Non-atomic initialization, single-pass.
  */
-void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn)
 {
 	struct page *page;
@@ -2407,7 +2407,7 @@ static void setup_per_zone_lowmem_reserve(void)
  *	that the pages_{min,low,high} values for each zone are set correctly 
  *	with respect to min_free_kbytes.
  */
-static void setup_per_zone_pages_min(void)
+void setup_per_zone_pages_min(void)
 {
 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
 	unsigned long lowmem_pages = 0;

From 0b0acbec1bed75ec1e1daa7f7006323a2a2b2844 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:55 -0700
Subject: [PATCH 84/91] [PATCH] memory hotplug: move section_mem_map alloc to
 sparse.c

This basically keeps us from having to extern __kmalloc_section_memmap().

The vaddr_in_vmalloc_area() helper could go in a vmalloc header, but that
header is hard to work with because it needs some arch-specific macros.
Just stick it in here for now instead of creating another header.
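
For reference, the simplified caller in mm/memory_hotplug.c ends up looking
roughly like this after the move (a sketch pieced together from the hunks
below, not new code in this patch):

	static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
	{
		int nr_pages = PAGES_PER_SECTION;
		int ret;

		/* memmap allocation and cleanup now live in sparse.c */
		ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
		if (ret < 0)
			return ret;

		__add_zone(zone, phys_start_pfn);
		return register_new_memory(__pfn_to_section(phys_start_pfn));
	}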

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Lion Vollnhals <webmaster@schiggl.de>
Signed-off-by: Jiri Slaby <xslaby@fi.muni.cz>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/acpi/acpi_memhotplug.c |  5 +--
 drivers/base/memory.c          |  5 +--
 mm/memory_hotplug.c            | 48 ++--------------------
 mm/sparse.c                    | 74 +++++++++++++++++++++++++++++++---
 4 files changed, 75 insertions(+), 57 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 01a1bd239263..2143609d2936 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -200,8 +200,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 	 * Note: Assume that this function returns zero on success
 	 */
 	result = add_memory(mem_device->start_addr,
-			    (mem_device->end_addr - mem_device->start_addr) + 1,
-			    mem_device->read_write_attribute);
+			    (mem_device->end_addr - mem_device->start_addr) + 1);
 	if (result) {
 		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n"));
 		mem_device->state = MEMORY_INVALID_STATE;
@@ -259,7 +258,7 @@ static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
 	 * Ask the VM to offline this memory range.
 	 * Note: Assume that this function returns zero on success
 	 */
-	result = remove_memory(start, len, attr);
+	result = remove_memory(start, len);
 	if (result) {
 		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n"));
 		return_VALUE(result);
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 785cb6e6b91c..b7ddd651d664 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -340,15 +340,12 @@ static int memory_probe_init(void)
 static int add_memory_block(unsigned long node_id, struct mem_section *section,
 		     unsigned long state, int phys_device)
 {
-	size_t size = sizeof(struct memory_block);
-	struct memory_block *mem = kmalloc(size, GFP_KERNEL);
+	struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
 	int ret = 0;
 
 	if (!mem)
 		return -ENOMEM;
 
-	memset(mem, 0, size);
-
 	mem->phys_index = __section_nr(section);
 	mem->state = state;
 	init_MUTEX(&mem->state_sem);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 855e0fc928b3..2e916c308ae6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -24,28 +24,6 @@
 
 #include <asm/tlbflush.h>
 
-static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
-{
-	struct page *page, *ret;
-	unsigned long memmap_size = sizeof(struct page) * nr_pages;
-
-	page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
-	if (page)
-		goto got_map_page;
-
-	ret = vmalloc(memmap_size);
-	if (ret)
-		goto got_map_ptr;
-
-	return NULL;
-got_map_page:
-	ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
-got_map_ptr:
-	memset(ret, 0, memmap_size);
-
-	return ret;
-}
-
 extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
 			  unsigned long size);
 static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
@@ -60,35 +38,15 @@ static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
 }
 
-extern int sparse_add_one_section(struct zone *, unsigned long,
-				  struct page *mem_map);
+extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+				  int nr_pages);
 static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int nr_pages = PAGES_PER_SECTION;
-	struct page *memmap;
 	int ret;
 
-	/*
-	 * This can potentially allocate memory, and does its own
-	 * internal locking.
-	 */
-	sparse_index_init(pfn_to_section_nr(phys_start_pfn), pgdat->node_id);
-
-	pgdat_resize_lock(pgdat, &flags);
-	memmap = __kmalloc_section_memmap(nr_pages);
-	ret = sparse_add_one_section(zone, phys_start_pfn, memmap);
-	pgdat_resize_unlock(pgdat, &flags);
-
-	if (ret <= 0) {
-		/* the mem_map didn't get used */
-		if (memmap >= (struct page *)VMALLOC_START &&
-		    memmap < (struct page *)VMALLOC_END)
-			vfree(memmap);
-		else
-			free_pages((unsigned long)memmap,
-				   get_order(sizeof(struct page) * nr_pages));
-	}
+	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
 
 	if (ret < 0)
 		return ret;
diff --git a/mm/sparse.c b/mm/sparse.c
index 0d3bd4bf3aaa..72079b538e2d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -5,8 +5,10 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/bootmem.h>
+#include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/vmalloc.h>
 #include <asm/dma.h>
 
 /*
@@ -187,6 +189,45 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
 	return NULL;
 }
 
+static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
+{
+	struct page *page, *ret;
+	unsigned long memmap_size = sizeof(struct page) * nr_pages;
+
+	page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
+	if (page)
+		goto got_map_page;
+
+	ret = vmalloc(memmap_size);
+	if (ret)
+		goto got_map_ptr;
+
+	return NULL;
+got_map_page:
+	ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
+got_map_ptr:
+	memset(ret, 0, memmap_size);
+
+	return ret;
+}
+
+static int vaddr_in_vmalloc_area(void *addr)
+{
+	if (addr >= (void *)VMALLOC_START &&
+	    addr < (void *)VMALLOC_END)
+		return 1;
+	return 0;
+}
+
+static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
+{
+	if (vaddr_in_vmalloc_area(memmap))
+		vfree(memmap);
+	else
+		free_pages((unsigned long)memmap,
+			   get_order(sizeof(struct page) * nr_pages));
+}
+
 /*
  * Allocate the accumulated non-linear sections, allocate a mem_map
  * for each and record the physical to section mapping.
@@ -212,14 +253,37 @@ void sparse_init(void)
  * set.  If this is <=0, then that means that the passed-in
  * map was not consumed and must be freed.
  */
-int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
+int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+			   int nr_pages)
 {
-	struct mem_section *ms = __pfn_to_section(start_pfn);
+	unsigned long section_nr = pfn_to_section_nr(start_pfn);
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	struct mem_section *ms;
+	struct page *memmap;
+	unsigned long flags;
+	int ret;
 
-	if (ms->section_mem_map & SECTION_MARKED_PRESENT)
-		return -EEXIST;
+	/*
+	 * no locking for this, because it does its own
+	 * plus, it does a kmalloc
+	 */
+	sparse_index_init(section_nr, pgdat->node_id);
+	memmap = __kmalloc_section_memmap(nr_pages);
 
+	pgdat_resize_lock(pgdat, &flags);
+
+	ms = __pfn_to_section(start_pfn);
+	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
+		ret = -EEXIST;
+		goto out;
+	}
 	ms->section_mem_map |= SECTION_MARKED_PRESENT;
 
-	return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map);
+	ret = sparse_init_one_section(ms, section_nr, memmap);
+
+	if (ret <= 0)
+		__kfree_section_memmap(memmap, nr_pages);
+out:
+	pgdat_resize_unlock(pgdat, &flags);
+	return ret;
 }

From 61b13993a81866fc1d4830dfab80530c9c061e37 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:56 -0700
Subject: [PATCH 85/91] [PATCH] memory hotplug: call setup_per_zone_pages_min
 after hotplug

From: IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
> I found the tests do not work well with Dave's patchset.
> I've found the following:
>
> 	- setup_per_zone_pages_min() calls should be added in
> 	   capture_page_range() and online_pages()
> 	- lru_add_drain() should be called before try_to_migrate_pages()

The following patch deals with the first item.

Signed-off-by: IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory_hotplug.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2e916c308ae6..431a64f021c0 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -132,5 +132,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	}
 	zone->present_pages += onlined_pages;
 
+	setup_per_zone_pages_min();
+
 	return 0;
 }

From 05039b926374212b2d861860cf54b9e839d4dd76 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:57 -0700
Subject: [PATCH 86/91] [PATCH] memory hotplug: i386 addition functions

Adds the functions necessary for non-NUMA hot-add of highmem to an existing
zone on i386.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/discontig.c |  4 +--
 arch/i386/mm/init.c      | 62 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index 244d8ec66be2..c4af9638dbfa 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -98,7 +98,7 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 
 extern unsigned long find_max_low_pfn(void);
 extern void find_max_pfn(void);
-extern void one_highpage_init(struct page *, int, int);
+extern void add_one_highpage_init(struct page *, int, int);
 
 extern struct e820map e820;
 extern unsigned long init_pg_tables_end;
@@ -427,7 +427,7 @@ void __init set_highmem_pages_init(int bad_ppro)
 			if (!pfn_valid(node_pfn))
 				continue;
 			page = pfn_to_page(node_pfn);
-			one_highpage_init(page, node_pfn, bad_ppro);
+			add_one_highpage_init(page, node_pfn, bad_ppro);
 		}
 	}
 	totalram_pages += totalhigh_pages;
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 2ebaf75f732e..542d9298da5e 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/efi.h>
+#include <linux/memory_hotplug.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -266,17 +267,46 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 	pkmap_page_table = pte;	
 }
 
-void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
+void __devinit free_new_highpage(struct page *page)
+{
+	set_page_count(page, 1);
+	__free_page(page);
+	totalhigh_pages++;
+}
+
+void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
 {
 	if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
 		ClearPageReserved(page);
-		set_page_count(page, 1);
-		__free_page(page);
-		totalhigh_pages++;
+		free_new_highpage(page);
 	} else
 		SetPageReserved(page);
 }
 
+static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
+{
+	free_new_highpage(page);
+	totalram_pages++;
+#ifdef CONFIG_FLATMEM
+	max_mapnr = max(pfn, max_mapnr);
+#endif
+	num_physpages++;
+	return 0;
+}
+
+/*
+ * Not currently handling the NUMA case.
+ * Assuming single node and all memory that
+ * has been added dynamically that would be
+ * onlined here is in HIGHMEM
+ */
+void online_page(struct page *page)
+{
+	ClearPageReserved(page);
+	add_one_highpage_hotplug(page, page_to_pfn(page));
+}
+
+
 #ifdef CONFIG_NUMA
 extern void set_highmem_pages_init(int);
 #else
@@ -284,7 +314,7 @@ static void __init set_highmem_pages_init(int bad_ppro)
 {
 	int pfn;
 	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
-		one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+		add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
 	totalram_pages += totalhigh_pages;
 }
 #endif /* CONFIG_FLATMEM */
@@ -615,6 +645,28 @@ void __init mem_init(void)
 #endif
 }
 
+/*
+ * this is for the non-NUMA, single node SMP system case.
+ * Specifically, in the case of x86, we will always add
+ * memory to the highmem for now.
+ */
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+int add_memory(u64 start, u64 size)
+{
+	struct pglist_data *pgdata = &contig_page_data;
+	struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+
+	return __add_pages(zone, start_pfn, nr_pages);
+}
+
+int remove_memory(u64 start, u64 size)
+{
+	return -EINVAL;
+}
+#endif
+
 kmem_cache_t *pgd_cache;
 kmem_cache_t *pmd_cache;
 

From bb7e7e032d2cb8e0e9a88a2be209de5e61033b39 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:58 -0700
Subject: [PATCH 87/91] [PATCH] memory hotplug: ppc64 specific hot-add
 functions

Here is a set of ppc64-specific patches that at least allow
compilation/booting with the following configurations:

FLATMEM
SPARSEMEM
SPARSEMEM + MEMORY_HOTPLUG

Signed-off-by: Mike Kravetz <kravetz@us.ibm.com>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc64/mm/init.c | 77 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index 975b26de34d6..e2bd7776622f 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -871,3 +871,80 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
 	return vma_prot;
 }
 EXPORT_SYMBOL(phys_mem_access_prot);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+void online_page(struct page *page)
+{
+	ClearPageReserved(page);
+	free_cold_page(page);
+	totalram_pages++;
+	num_physpages++;
+}
+
+/*
+ * This works only for the non-NUMA case.  Later, we'll need a lookup
+ * to convert from real physical addresses to nid, that doesn't use
+ * pfn_to_nid().
+ */
+int __devinit add_memory(u64 start, u64 size)
+{
+	struct pglist_data *pgdata = NODE_DATA(0);
+	struct zone *zone;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+
+	/* this should work for most non-highmem platforms */
+	zone = pgdata->node_zones;
+
+	return __add_pages(zone, start_pfn, nr_pages);
+
+	return 0;
+}
+
+/*
+ * First pass at this code will check to determine if the remove
+ * request is within the RMO.  Do not allow removal within the RMO.
+ */
+int __devinit remove_memory(u64 start, u64 size)
+{
+	struct zone *zone;
+	unsigned long start_pfn, end_pfn, nr_pages;
+
+	start_pfn = start >> PAGE_SHIFT;
+	nr_pages = size >> PAGE_SHIFT;
+	end_pfn = start_pfn + nr_pages;
+
+	printk("%s(): Attempting to remove memoy in range "
+			"%lx to %lx\n", __func__, start, start+size);
+	/*
+	 * check for range within RMO
+	 */
+	zone = page_zone(pfn_to_page(start_pfn));
+
+	printk("%s(): memory will be removed from "
+			"the %s zone\n", __func__, zone->name);
+
+	/*
+	 * not handling removing memory ranges that
+	 * overlap multiple zones yet
+	 */
+	if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages))
+		goto overlap;
+
+	/* make sure it is NOT in RMO */
+	if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) {
+		printk("%s(): range to be removed must NOT be in RMO!\n",
+			__func__);
+		goto in_rmo;
+	}
+
+	return __remove_pages(zone, start_pfn, nr_pages);
+
+overlap:
+	printk("%s(): memory range to be removed overlaps "
+		"multiple zones!!!\n", __func__);
+in_rmo:
+	return -1;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */

From 8bccd85ffbaf8ff1448d1235fa6594e207695531 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Sat, 29 Oct 2005 18:16:59 -0700
Subject: [PATCH 88/91] [PATCH] Implement sys_* do_* layering in the memory
 policy layer.

- Do a separation between do_xxx and sys_xxx functions. sys_xxx functions
  take variable-sized bitmaps from user space as arguments. do_xxx functions
  take fixed-sized nodemask_t as arguments and may be used from inside the
  kernel. Doing so simplifies the initialization code. There is no
  fs = kernel_ds assumption anymore (see the sketch after this list).

- Split up get_nodes into get_nodes (which gets the node list) and
  contextualize_policy which restricts the nodes to those accessible
  to the task and updates cpusets.

- Add comments explaining limitations of bind policy
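
As an illustrative sketch (not part of the patch), the point of the layering
for in-kernel users is that a nodemask_t can now be passed directly instead
of routing a bitmap pointer through the syscall under the old fs == KERNEL_DS
assumption; example_set_boot_interleave() below is a hypothetical caller:

	/* hypothetical in-kernel caller of the new do_* layer */
	static void __init example_set_boot_interleave(void)
	{
		/*
		 * Old style (relied on the kernel address limit covering
		 * kernel pointers):
		 *   sys_set_mempolicy(MPOL_INTERLEAVE,
		 *                     nodes_addr(node_online_map), MAX_NUMNODES);
		 *
		 * New style: hand the kernel nodemask_t straight to do_*:
		 */
		do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map);
	}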

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 276 +++++++++++++++++++++++++++++--------------------
 1 file changed, 162 insertions(+), 114 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 902d4c9eccdc..123925f50f86 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2,6 +2,7 @@
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
  * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
@@ -17,13 +18,19 @@
  *                offset into the backing object or offset into the mapping
  *                for anonymous memory. For process policy an process counter
  *                is used.
+ *
  * bind           Only allocate memory on a specific set of nodes,
  *                no fallback.
+ *                FIXME: memory is allocated starting with the first node
+ *                to the last. It would be better if bind would truly restrict
+ *                the allocation to memory nodes instead
+ *
  * preferred       Try a specific node first before normal fallback.
  *                As a special case node -1 here means do the allocation
  *                on the local CPU. This is normally identical to default,
  *                but useful to set in a VMA when you have a non default
  *                process policy.
+ *
  * default        Allocate on the local node first, or when on a VMA
  *                use the process policy. This is what Linux always did
  *		  in a NUMA aware kernel and still does by, ahem, default.
@@ -113,56 +120,6 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	}
 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
-
-/* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
-		     unsigned long maxnode, int mode)
-{
-	unsigned long k;
-	unsigned long nlongs;
-	unsigned long endmask;
-
-	--maxnode;
-	nodes_clear(*nodes);
-	if (maxnode == 0 || !nmask)
-		return 0;
-
-	nlongs = BITS_TO_LONGS(maxnode);
-	if ((maxnode % BITS_PER_LONG) == 0)
-		endmask = ~0UL;
-	else
-		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
-
-	/* When the user specified more nodes than supported just check
-	   if the non supported part is all zero. */
-	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
-		if (nlongs > PAGE_SIZE/sizeof(long))
-			return -EINVAL;
-		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
-			unsigned long t;
-			if (get_user(t, nmask + k))
-				return -EFAULT;
-			if (k == nlongs - 1) {
-				if (t & endmask)
-					return -EINVAL;
-			} else if (t)
-				return -EINVAL;
-		}
-		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
-		endmask = ~0UL;
-	}
-
-	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
-		return -EFAULT;
-	nodes_addr(*nodes)[nlongs-1] &= endmask;
-	/* Update current mems_allowed */
-	cpuset_update_current_mems_allowed();
-	/* Ignore nodes not set in current->mems_allowed */
-	/* AK: shouldn't this error out instead? */
-	cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
-	return mpol_check_policy(mode, nodes);
-}
-
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
@@ -380,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 	return err;
 }
 
-/* Change policy for a memory range */
-asmlinkage long sys_mbind(unsigned long start, unsigned long len,
-			  unsigned long mode,
-			  unsigned long __user *nmask, unsigned long maxnode,
-			  unsigned flags)
+static int contextualize_policy(int mode, nodemask_t *nodes)
+{
+	if (!nodes)
+		return 0;
+
+	/* Update current mems_allowed */
+	cpuset_update_current_mems_allowed();
+	/* Ignore nodes not set in current->mems_allowed */
+	cpuset_restrict_to_mems_allowed(nodes->bits);
+	return mpol_check_policy(mode, nodes);
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+		unsigned long mode, nodemask_t *nmask, unsigned long flags)
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	struct mempolicy *new;
 	unsigned long end;
-	nodemask_t nodes;
 	int err;
 
 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@@ -405,12 +370,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 		return -EINVAL;
 	if (end == start)
 		return 0;
-
-	err = get_nodes(&nodes, nmask, maxnode, mode);
-	if (err)
-		return err;
-
-	new = mpol_new(mode, &nodes);
+	if (contextualize_policy(mode, nmask))
+		return -EINVAL;
+	new = mpol_new(mode, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
@@ -418,7 +380,7 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 			mode,nodes_addr(nodes)[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, &nodes, flags);
+	vma = check_range(mm, start, end, nmask, flags);
 	err = PTR_ERR(vma);
 	if (!IS_ERR(vma))
 		err = mbind_range(vma, start, end, new);
@@ -428,19 +390,13 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 }
 
 /* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
-				   unsigned long maxnode)
+long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
-	int err;
 	struct mempolicy *new;
-	nodemask_t nodes;
 
-	if (mode < 0 || mode > MPOL_MAX)
+	if (contextualize_policy(mode, nodes))
 		return -EINVAL;
-	err = get_nodes(&nodes, nmask, maxnode, mode);
-	if (err)
-		return err;
-	new = mpol_new(mode, &nodes);
+	new = mpol_new(mode, nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
@@ -459,7 +415,8 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 	switch (p->policy) {
 	case MPOL_BIND:
 		for (i = 0; p->v.zonelist->zones[i]; i++)
-			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
+			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+				*nodes);
 		break;
 	case MPOL_DEFAULT:
 		break;
@@ -491,38 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
 	return err;
 }
 
-/* Copy a kernel node mask to user space */
-static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
-			      nodemask_t *nodes)
-{
-	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
-	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
-
-	if (copy > nbytes) {
-		if (copy > PAGE_SIZE)
-			return -EINVAL;
-		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
-			return -EFAULT;
-		copy = nbytes;
-	}
-	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
-}
-
 /* Retrieve NUMA policy */
-asmlinkage long sys_get_mempolicy(int __user *policy,
-				  unsigned long __user *nmask,
-				  unsigned long maxnode,
-				  unsigned long addr, unsigned long flags)
+long do_get_mempolicy(int *policy, nodemask_t *nmask,
+			unsigned long addr, unsigned long flags)
 {
-	int err, pval;
+	int err;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 		return -EINVAL;
-	if (nmask != NULL && maxnode < MAX_NUMNODES)
-		return -EINVAL;
 	if (flags & MPOL_F_ADDR) {
 		down_read(&mm->mmap_sem);
 		vma = find_vma_intersection(mm, addr, addr+1);
@@ -545,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
 			err = lookup_node(mm, addr);
 			if (err < 0)
 				goto out;
-			pval = err;
+			*policy = err;
 		} else if (pol == current->mempolicy &&
 				pol->policy == MPOL_INTERLEAVE) {
-			pval = current->il_next;
+			*policy = current->il_next;
 		} else {
 			err = -EINVAL;
 			goto out;
 		}
 	} else
-		pval = pol->policy;
+		*policy = pol->policy;
 
 	if (vma) {
 		up_read(&current->mm->mmap_sem);
 		vma = NULL;
 	}
 
-	if (policy && put_user(pval, policy))
-		return -EFAULT;
-
 	err = 0;
-	if (nmask) {
-		nodemask_t nodes;
-		get_zonemask(pol, &nodes);
-		err = copy_nodes_to_user(nmask, maxnode, &nodes);
-	}
+	if (nmask)
+		get_zonemask(pol, nmask);
 
  out:
 	if (vma)
@@ -577,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
 	return err;
 }
 
+/*
+ * User space interface with variable sized bitmaps for nodelists.
+ */
+
+/* Copy a node mask from user space. */
+static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+		     unsigned long maxnode)
+{
+	unsigned long k;
+	unsigned long nlongs;
+	unsigned long endmask;
+
+	--maxnode;
+	nodes_clear(*nodes);
+	if (maxnode == 0 || !nmask)
+		return 0;
+
+	nlongs = BITS_TO_LONGS(maxnode);
+	if ((maxnode % BITS_PER_LONG) == 0)
+		endmask = ~0UL;
+	else
+		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+	/* When the user specified more nodes than supported just check
+	   if the non supported part is all zero. */
+	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+		if (nlongs > PAGE_SIZE/sizeof(long))
+			return -EINVAL;
+		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+			unsigned long t;
+			if (get_user(t, nmask + k))
+				return -EFAULT;
+			if (k == nlongs - 1) {
+				if (t & endmask)
+					return -EINVAL;
+			} else if (t)
+				return -EINVAL;
+		}
+		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+		endmask = ~0UL;
+	}
+
+	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
+		return -EFAULT;
+	nodes_addr(*nodes)[nlongs-1] &= endmask;
+	return 0;
+}
+
+/* Copy a kernel node mask to user space */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+			      nodemask_t *nodes)
+{
+	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
+	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+
+	if (copy > nbytes) {
+		if (copy > PAGE_SIZE)
+			return -EINVAL;
+		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
+			return -EFAULT;
+		copy = nbytes;
+	}
+	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
+}
+
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+			unsigned long mode,
+			unsigned long __user *nmask, unsigned long maxnode,
+			unsigned flags)
+{
+	nodemask_t nodes;
+	int err;
+
+	err = get_nodes(&nodes, nmask, maxnode);
+	if (err)
+		return err;
+	return do_mbind(start, len, mode, &nodes, flags);
+}
+
+/* Set the process memory policy */
+asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+		unsigned long maxnode)
+{
+	int err;
+	nodemask_t nodes;
+
+	if (mode < 0 || mode > MPOL_MAX)
+		return -EINVAL;
+	err = get_nodes(&nodes, nmask, maxnode);
+	if (err)
+		return err;
+	return do_set_mempolicy(mode, &nodes);
+}
+
+/* Retrieve NUMA policy */
+asmlinkage long sys_get_mempolicy(int __user *policy,
+				unsigned long __user *nmask,
+				unsigned long maxnode,
+				unsigned long addr, unsigned long flags)
+{
+	int err, pval;
+	nodemask_t nodes;
+
+	if (nmask != NULL && maxnode < MAX_NUMNODES)
+		return -EINVAL;
+
+	err = do_get_mempolicy(&pval, &nodes, addr, flags);
+
+	if (err)
+		return err;
+
+	if (policy && put_user(pval, policy))
+		return -EFAULT;
+
+	if (nmask)
+		err = copy_nodes_to_user(nmask, maxnode, &nodes);
+
+	return err;
+}
+
 #ifdef CONFIG_COMPAT
 
 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
@@ -664,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy)
-		        pol = vma->vm_ops->get_policy(vma, addr);
+			pol = vma->vm_ops->get_policy(vma, addr);
 		else if (vma->vm_policy &&
 				vma->vm_policy->policy != MPOL_DEFAULT)
 			pol = vma->vm_policy;
@@ -1147,14 +1197,12 @@ void __init numa_policy_init(void)
 	/* Set interleaving policy for system init. This way not all
 	   the data structures allocated at system boot end up in node zero. */
 
-	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
-							MAX_NUMNODES) < 0)
+	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
 		printk("numa_policy_init: interleaving failed\n");
 }
 
-/* Reset policy of current process to default.
- * Assumes fs == KERNEL_DS */
+/* Reset policy of current process to default */
 void numa_default_policy(void)
 {
-	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
+	do_set_mempolicy(MPOL_DEFAULT, NULL);
 }

From 5fcbb23050936d69de8087d4b311eaf55cb42740 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Sat, 29 Oct 2005 18:17:00 -0700
Subject: [PATCH 89/91] [PATCH] Remove policy contextualization from mbind

Policy contextualization is only useful for task-based policies and not for
vma-based policies.  It may be useful to define allowed nodes that are not
accessible from this thread because other threads may have access to these
nodes.  Without this patch, strange memory policy situations may cause an
application to fail with out of memory.

Example:

Let's say we have two threads A and B that share the same address space and
a huge computational array X.

Thread A is restricted by its cpuset to nodes 0 and 1 and thread B is
restricted by its cpuset to nodes 2 and 3.

Thread A now wants to restrict allocations to a subset of the nodes and thus
applies a BIND policy on X covering nodes 0 and 2.  The cpuset limits this to
node 0.  Thus pages for X must be allocated on node 0 now.

Thread B now touches a page of X that has never been used and faults it in.
According to the BIND policy of the vma for X the page must be allocated on
node 0.  However, the cpuset of B does not allow allocation on nodes 0 and 1.
Now the application fails in alloc_pages with out of memory.
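
A minimal userspace sketch of the failing pattern (hypothetical reproducer,
not part of the patch; assumes thread A is confined to nodes 0-1 and thread B
to nodes 2-3 by their cpusets, and that X is an anonymous mapping they share):

	#include <numaif.h>		/* mbind(), MPOL_BIND */
	#include <sys/mman.h>

	#define X_LEN	(256UL << 20)

	static void *x;

	/* runs in thread A (cpuset: nodes 0 and 1) */
	static void bind_x(void)
	{
		unsigned long nodes = (1UL << 0) | (1UL << 2);	/* nodes 0 and 2 */

		x = mmap(NULL, X_LEN, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		/*
		 * With contextualization the stored vma policy shrinks to
		 * node 0; when thread B (cpuset: nodes 2 and 3) later
		 * faults a page of x, node 0 is off limits to it and
		 * alloc_pages() fails with out of memory.
		 */
		mbind(x, X_LEN, MPOL_BIND, &nodes, sizeof(nodes) * 8, 0);
	}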

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 123925f50f86..2076b1542b8a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -370,7 +370,7 @@ long do_mbind(unsigned long start, unsigned long len,
 		return -EINVAL;
 	if (end == start)
 		return 0;
-	if (contextualize_policy(mode, nmask))
+	if (mpol_check_policy(mode, nmask))
 		return -EINVAL;
 	new = mpol_new(mode, nmask);
 	if (IS_ERR(new))

From 2f96996de0eda378df2a5f857ee1ef615ae10a4f Mon Sep 17 00:00:00 2001
From: John Hawkes <hawkes@sgi.com>
Date: Sat, 29 Oct 2005 18:17:01 -0700
Subject: [PATCH 90/91] [PATCH] mm: wider use of for_each_*cpu()

In 'mm' change the explicit use of a for-loop using NR_CPUS into the
general for_each_cpu() constructs.  This widens the scope of potential
future optimizations of the general constructs, as well as takes advantage
of the existing optimizations of first_cpu() and next_cpu(), which is
advantageous when the true CPU count is much smaller than NR_CPUS.

Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 183abf39b445..2dbdd98426fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1331,12 +1331,9 @@ void show_free_areas(void)
 		} else
 			printk("\n");
 
-		for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+		for_each_cpu(cpu) {
 			struct per_cpu_pageset *pageset;
 
-			if (!cpu_possible(cpu))
-				continue;
-
 			pageset = zone_pcp(zone, cpu);
 
 			for (temperature = 0; temperature < 2; temperature++)

From b1459461f1e0abd5c28317d6bff6f2ca612a719d Mon Sep 17 00:00:00 2001
From: Nikita Danilov <nikita@clusterfs.com>
Date: Sat, 29 Oct 2005 18:17:02 -0700
Subject: [PATCH 91/91] [PATCH] mm/filemap.c:filemap_populate(): move export.

Move EXPORT_SYMBOL(filemap_populate) to the proper place: just after the
function itself.  It's easy to miss that the function is exported otherwise.

Signed-off-by: Nikita Danilov <nikita@clusterfs.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/filemap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 036599d1177e..768687f1d46b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1537,6 +1537,7 @@ repeat:
 
 	return 0;
 }
+EXPORT_SYMBOL(filemap_populate);
 
 struct vm_operations_struct generic_file_vm_ops = {
 	.nopage		= filemap_nopage,
@@ -1555,7 +1556,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 	vma->vm_ops = &generic_file_vm_ops;
 	return 0;
 }
-EXPORT_SYMBOL(filemap_populate);
 
 /*
  * This is for filesystems which do not implement ->writepage.