[PATCH 09/39] bootmem, x86: Add weak version of reserve_bootmem_generic

Previous thread: [PATCH 18/39] lmb: Add lmb_reserve_area_overlap_ok() by Yinghai Lu on Thursday, April 8, 2010 - 11:03 pm. (5 messages)

Next thread: [PATCH 2/5] sched: add asymmetric packing option for sibling domain by Michael Neuling on Thursday, April 8, 2010 - 11:21 pm. (3 messages)
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

the new lmb could be used to replace early_res in x86.

Suggested by: David, Ben, and Thomas

-v6: change sequence as requested by Thomas
-v7: separate them into more patches
-v8: add boundary checking to make sure we do not free a partial page.
-v9: use lmb_debug to control print out of reserve_lmb.
     add e820 clean up, and e820 become __initdata
-v10:use lmb.rmo_size and ARCH_DISCARD_LMB according to Michael
     change name to lmb_find_area/reserve_lmb_area/free_lmb_area,
      according to Michael
     update find_lmb_area to use __lmb_alloc_base according to ben
-v11:move find_lmb_area_size back to x86.
     x86 has own find_lmb_area, and could be disabled by ARCH_LMB_FIND_AREA
      because _lmb_find_base has different behavior from x86's old one:
      one goes from high to low and the other from low to high
      need more test
     tested for x86 32bit/64bit, numa/nonuma, nobootmem/bootmem.
-v12:refresh the series with current tip
     separate nobootmem.c, so we could remove some #ifdef
     still keep CONFIG_NO_BOOTMEM in the x86 .c files, and could use them as tags
     so other lmb users could refer to them to use NO_BOOTMEM.

still keep find_lmb_area; may replace those find_lmb_area calls with lmb_alloc, if
everything goes fine

Please put them into tip and -next to have more test coverage.

Thanks

Yinghai
--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

size is returned according to the free range.
Will be used to find free ranges for early_memtest and memory corruption check

Do not mess it up with mm/lmb.c yet.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/mm/Makefile |    2 +
 arch/x86/mm/lmb.c    |   88 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/mm/lmb.c

diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index a4c7683..8ab0505 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -26,4 +26,6 @@ obj-$(CONFIG_NUMA)		+= numa.o numa_$(BITS).o
 obj-$(CONFIG_K8_NUMA)		+= k8topology_64.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat_$(BITS).o
 
+obj-$(CONFIG_HAVE_LMB)		+= lmb.o
+
 obj-$(CONFIG_MEMTEST)		+= memtest.o
diff --git a/arch/x86/mm/lmb.c b/arch/x86/mm/lmb.c
new file mode 100644
index 0000000..3229e9e
--- /dev/null
+++ b/arch/x86/mm/lmb.c
@@ -0,0 +1,88 @@
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/lmb.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <linux/range.h>
+
+/* Check for already reserved areas */
+static inline bool __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
+{
+	int i;
+	u64 addr = *addrp, last;
+	u64 size = *sizep;
+	bool changed = false;
+again:
+	last = addr + size;
+	for (i = 0; i < lmb.reserved.cnt && lmb.reserved.region[i].size; i++) {
+		struct lmb_property *r = &lmb.reserved.region[i];
+		if (last > r->base && addr < r->base) {
+			size = r->base - addr;
+			changed = true;
+			goto again;
+		}
+		if (last > (r->base + r->size) && addr < (r->base + r->size)) {
+			addr = round_up(r->base + r->size, align);
+			size = last - addr;
+			changed = true;
+			goto again;
+		}
+		if (last <= (r->base + r->size) && addr >= r->base) {
+			(*sizep)++;
+			return false;
+		}
+	}
+	if (changed) {
+		*addrp = addr;
+		*sizep = size;
+	}
+	return changed;
+}
+
+static u64 ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

So those lmb bits could be released after kernel is booted up.

Arch code could define ARCH_DISCARD_LMB in asm/lmb.h; then __init_lmb will become __init and __initdata_lmb will become __initdata

x86 code will use that.

-v2: use ARCH_DISCARD_LMB according to Michael Ellerman

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 include/linux/lmb.h |    8 +++++++
 mm/lmb.c            |   54 ++++++++++++++++++++++++++++++--------------------
 2 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/include/linux/lmb.h b/include/linux/lmb.h
index 3c8095f..f615626 100644
--- a/include/linux/lmb.h
+++ b/include/linux/lmb.h
@@ -106,6 +106,14 @@ u64 lmb_memory_size(u64 addr, u64 limit);
 
 #include <asm/lmb.h>
 
+#ifdef ARCH_DISCARD_LMB
+#define __init_lmb __init
+#define __initdata_lmb __initdata
+#else
+#define __init_lmb
+#define __initdata_lmb
+#endif
+
 #endif /* CONFIG_HAVE_LMB */
 
 #endif /* __KERNEL__ */
diff --git a/mm/lmb.c b/mm/lmb.c
index dfbf660..ab3d85f 100644
--- a/mm/lmb.c
+++ b/mm/lmb.c
@@ -21,11 +21,11 @@
 
 #define LMB_ALLOC_ANYWHERE	0
 
-struct lmb lmb;
-static struct lmb_property lmb_memory_region[MAX_LMB_REGIONS + 1];
-static struct lmb_property lmb_reserved_region[MAX_LMB_REGIONS + 1];
+struct lmb lmb __initdata_lmb;
+static struct lmb_property lmb_memory_region[MAX_LMB_REGIONS + 1] __initdata_lmb;
+static struct lmb_property lmb_reserved_region[MAX_LMB_REGIONS + 1] __initdata_lmb;
 
-static int lmb_debug;
+static int lmb_debug __initdata_lmb;
 
 static int __init early_lmb(char *p)
 {
@@ -35,7 +35,7 @@ static int __init early_lmb(char *p)
 }
 early_param("lmb", early_lmb);
 
-static void lmb_dump(struct lmb_region *region, char *name)
+static void __init_lmb lmb_dump(struct lmb_region *region, char *name)
 {
 	unsigned long long base, size;
 	int i;
@@ -51,7 +51,7 @@ static void lmb_dump(struct lmb_region *region, char *name)
 	}
 }
 
-void lmb_dump_all(void)
+void __init_lmb lmb_dump_all(void)
 {
 	if (!lmb_debug)
 ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

It will be used for the lmb_to_bootmem conversion.

It is a wrapper for reserve_bootmem, and x86 64bit is using a special one.

Also clean up that version for x86_64. We don't need to take care of the numa
path for that; bootmem can handle it now

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/mm/init_32.c |    6 ------
 arch/x86/mm/init_64.c |   20 ++------------------
 mm/bootmem.c          |    6 ++++++
 3 files changed, 8 insertions(+), 24 deletions(-)

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca7909..90e0545 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -1069,9 +1069,3 @@ void mark_rodata_ro(void)
 #endif
 }
 #endif
-
-int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
-				   int flags)
-{
-	return reserve_bootmem(phys, len, flags);
-}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ee41bba..634fa08 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -799,13 +799,10 @@ void mark_rodata_ro(void)
 
 #endif
 
+#ifndef CONFIG_NO_BOOTMEM
 int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
 				   int flags)
 {
-#ifdef CONFIG_NUMA
-	int nid, next_nid;
-	int ret;
-#endif
 	unsigned long pfn = phys >> PAGE_SHIFT;
 
 	if (pfn >= max_pfn) {
@@ -821,21 +818,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
 		return -EFAULT;
 	}
 
-	/* Should check here against the e820 map to avoid double free */
-#ifdef CONFIG_NUMA
-	nid = phys_to_nid(phys);
-	next_nid = phys_to_nid(phys + len - 1);
-	if (nid == next_nid)
-		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
-	else
-		ret = reserve_bootmem(phys, len, flags);
-
-	if (ret != 0)
-		return ret;
-
-#else
 	reserve_bootmem(phys, len, flags);
-#endif
 
 	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
 		dma_reserve += len / PAGE_SIZE;
@@ -844,6 +827,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
 
 	return 0;
 ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

lmb_to_bootmem() will reserve lmb.reserved.region in bootmem after bootmem is
set up.

We can use it with all arches that support lmb later.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 include/linux/lmb.h |    2 ++
 mm/lmb.c            |   32 ++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 0 deletions(-)

diff --git a/include/linux/lmb.h b/include/linux/lmb.h
index 598662f..1e236d1 100644
--- a/include/linux/lmb.h
+++ b/include/linux/lmb.h
@@ -91,6 +91,8 @@ u64 __lmb_find_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
 			 u64 size, u64 align);
 u64 lmb_find_area(u64 start, u64 end, u64 size, u64 align);
 
+void lmb_to_bootmem(u64 start, u64 end);
+
 #include <asm/lmb.h>
 
 #endif /* __KERNEL__ */
diff --git a/mm/lmb.c b/mm/lmb.c
index a514d41..ee3d945 100644
--- a/mm/lmb.c
+++ b/mm/lmb.c
@@ -630,6 +630,38 @@ void __init lmb_free_area(u64 start, u64 end)
 	__check_and_double_region_array(&lmb.reserved, &lmb_reserved_region[0]);
 }
 
+#ifndef CONFIG_NO_BOOTMEM
+void __init lmb_to_bootmem(u64 start, u64 end)
+{
+	int i, count;
+	u64 final_start, final_end;
+
+	/* Take out region array itself */
+	if (lmb.reserved.region != lmb_reserved_region)
+		lmb_free(__pa(lmb.reserved.region), sizeof(struct lmb_property) * lmb.reserved.nr_regions);
+
+	count  = lmb.reserved.cnt;
+	pr_info("(%d early reservations) ==> bootmem [%010llx - %010llx]\n", count, start, end);
+	for (i = 0; i < count; i++) {
+		struct lmb_property *r = &lmb.reserved.region[i];
+		pr_info("  #%d [%010llx - %010llx] ", i, r->base, r->base + r->size);
+		final_start = max(start, r->base);
+		final_end = min(end, r->base + r->size);
+		if (final_start >= final_end) {
+			pr_cont("\n");
+			continue;
+		}
+		pr_cont(" ==> [%010llx - %010llx]\n", final_start, final_end);
+		reserve_bootmem_generic(final_start, final_end - final_start, BOOTMEM_DEFAULT);
+	}
+	/* Clear them to avoid misusing ? */
+	memset(&lmb.reserved.region[0], 0, sizeof(struct lmb_property) * ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

lmb_init() will connect them back.
Add nr_regions in struct lmb_region to track region array size.

So later we can install a dynamically allocated region array to that pointer

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 include/linux/lmb.h |    3 ++-
 mm/lmb.c            |    9 ++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/linux/lmb.h b/include/linux/lmb.h
index f3d1433..e14ea8d 100644
--- a/include/linux/lmb.h
+++ b/include/linux/lmb.h
@@ -26,7 +26,8 @@ struct lmb_property {
 struct lmb_region {
 	unsigned long cnt;
 	u64 size;
-	struct lmb_property region[MAX_LMB_REGIONS+1];
+	struct lmb_property *region;
+	unsigned long nr_regions;
 };
 
 struct lmb {
diff --git a/mm/lmb.c b/mm/lmb.c
index b1fc526..65b62dc 100644
--- a/mm/lmb.c
+++ b/mm/lmb.c
@@ -18,6 +18,8 @@
 #define LMB_ALLOC_ANYWHERE	0
 
 struct lmb lmb;
+static struct lmb_property lmb_memory_region[MAX_LMB_REGIONS + 1];
+static struct lmb_property lmb_reserved_region[MAX_LMB_REGIONS + 1];
 
 static int lmb_debug;
 
@@ -106,6 +108,11 @@ static void lmb_coalesce_regions(struct lmb_region *rgn,
 
 void __init lmb_init(void)
 {
+	lmb.memory.region   = lmb_memory_region;
+	lmb.reserved.region = lmb_reserved_region;
+	lmb.memory.nr_regions   = ARRAY_SIZE(lmb_memory_region);
+	lmb.reserved.nr_regions = ARRAY_SIZE(lmb_reserved_region);
+
 	/* Create a dummy zero size LMB which will get coalesced away later.
 	 * This simplifies the lmb_add() code below...
 	 */
@@ -169,7 +176,7 @@ static long lmb_add_region(struct lmb_region *rgn, u64 base, u64 size)
 
 	if (coalesced)
 		return coalesced;
-	if (rgn->cnt >= MAX_LMB_REGIONS)
+	if (rgn->cnt > rgn->nr_regions)
 		return -1;
 
 	/* Couldn't coalesce the LMB, so add it to the sorted table. */
-- 
1.6.4.2

--

From: Benjamin Herrenschmidt
Date: Monday, April 12, 2010 - 8:56 pm

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:04 pm

We can reduce #ifdef number from 3 to one in init_32.c

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/mm/init_32.c |   15 ++++++++++-----
 1 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c01c711..dfdd035 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -771,11 +771,9 @@ static unsigned long __init setup_node_bootmem(int nodeid,
 
 	return bootmap + bootmap_size;
 }
-#endif
 
 void __init setup_bootmem_allocator(void)
 {
-#ifndef CONFIG_NO_BOOTMEM
 	int nodeid;
 	unsigned long bootmap_size, bootmap;
 	/*
@@ -787,13 +785,11 @@ void __init setup_bootmem_allocator(void)
 	if (bootmap == -1L)
 		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
 	lmb_reserve_area(bootmap, bootmap + bootmap_size, "BOOTMAP");
-#endif
 
 	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
 		 max_pfn_mapped<<PAGE_SHIFT);
 	printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
 
-#ifndef CONFIG_NO_BOOTMEM
 	for_each_online_node(nodeid) {
 		 unsigned long start_pfn, end_pfn;
 
@@ -811,10 +807,19 @@ void __init setup_bootmem_allocator(void)
 		bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
 						 bootmap);
 	}
-#endif
 
 	after_bootmem = 1;
 }
+#else
+void __init setup_bootmem_allocator(void)
+{
+	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
+		 max_pfn_mapped<<PAGE_SHIFT);
+	printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
+
+	after_bootmem = 1;
+}
+#endif
 
 /*
  * paging_init() sets up the page tables - note that the first 8MB are
-- 
1.6.4.2

--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:04 pm

We don't need to expose e820_any_mapped() anymore

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/kernel/e820.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index be6e1e6..471784b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -47,9 +47,10 @@ EXPORT_SYMBOL(pci_mem_start);
 /*
  * This function checks if any part of the range <start,end> is mapped
  * with type.
+ * phys_pud_init() is using it and is _meminit, but we have !after_bootmem
+ * so could use refok here
  */
-int
-e820_any_mapped(u64 start, u64 end, unsigned type)
+int __init_refok e820_any_mapped(u64 start, u64 end, unsigned type)
 {
 	int i;
 
@@ -64,7 +65,6 @@ e820_any_mapped(u64 start, u64 end, unsigned type)
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(e820_any_mapped);
 
 /*
  * This function checks if the entire range <start,end> is mapped with type.
-- 
1.6.4.2

--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:04 pm

So we can make e820 to be __initdata

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/kernel/tboot.c |   22 +++++++++-------------
 1 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index cc2c604..cf27d64 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -170,34 +170,30 @@ static void tboot_create_trampoline(void)
 
 #ifdef CONFIG_ACPI_SLEEP
 
-static void add_mac_region(phys_addr_t start, unsigned long size)
+static int
+add_mac_region(unsigned long start_pfn, unsigned long nr_pages, void  *arg)
 {
+	u64 start = start_pfn;
+	u64 size = nr_pages;
 	struct tboot_mac_region *mr;
-	phys_addr_t end = start + size;
 
 	if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS)
 		panic("tboot: Too many MAC regions\n");
 
 	if (start && size) {
 		mr = &tboot->mac_regions[tboot->num_mac_regions++];
-		mr->start = round_down(start, PAGE_SIZE);
-		mr->size  = round_up(end, PAGE_SIZE) - mr->start;
+		mr->start = start << PAGE_SHIFT;
+		mr->size  = (u32) (size << PAGE_SHIFT);
 	}
+
+	return 0;
 }
 
 static int tboot_setup_sleep(void)
 {
-	int i;
-
 	tboot->num_mac_regions = 0;
 
-	for (i = 0; i < e820.nr_map; i++) {
-		if ((e820.map[i].type != E820_RAM)
-		 && (e820.map[i].type != E820_RESERVED_KERN))
-			continue;
-
-		add_mac_region(e820.map[i].addr, e820.map[i].size);
-	}
+	walk_system_ram_range(0, max_pfn, NULL, add_mac_region);
 
 	tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
 
-- 
1.6.4.2

--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:04 pm

We can remove #ifdef in mm/page_alloc.c

and change that function to static

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 include/linux/mm.h |    2 --
 mm/nobootmem.c     |   21 +++++++++++++++++++++
 mm/page_alloc.c    |   24 ------------------------
 3 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7774e1d..2a14361 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1161,8 +1161,6 @@ int add_from_early_node_map(struct range *range, int az,
 				   int nr_range, int nid);
 u64 __init find_memory_core_early(int nid, u64 size, u64 align,
 					u64 goal, u64 limit);
-void *__alloc_memory_core_early(int nodeid, u64 size, u64 align,
-				 u64 goal, u64 limit);
 typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
 extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
 extern void sparse_memory_present_with_active_regions(int nid);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index abaec96..8c26d02 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -40,6 +40,27 @@ unsigned long max_pfn;
 unsigned long saved_max_pfn;
 #endif
 
+static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit)
+{
+	void *ptr;
+
+	u64 addr;
+
+	if (limit > lmb.default_alloc_limit)
+		limit = lmb.default_alloc_limit;
+
+	addr = find_memory_core_early(nid, size, align, goal, limit);
+
+	if (addr == -1ULL)
+		return NULL;
+
+	ptr = phys_to_virt(addr);
+	memset(ptr, 0, size);
+	lmb_reserve_area(addr, addr + size, "BOOTMEM");
+	return ptr;
+}
+
 /*
  * free_bootmem_late - free bootmem pages directly to page allocator
  * @addr: starting address of the range
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 233c403..faa749c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3437,30 +3437,6 @@ int __init add_from_early_node_map(struct range *range, int az,
 	return nr_range;
 }
 
-#ifdef CONFIG_NO_BOOTMEM
-void * __init ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:04 pm

So we can avoid to access e820.map[] directly.

later we could move e820 to static and _initdata

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/include/asm/e820.h   |    9 ++++++
 arch/x86/kernel/cpu/centaur.c |   53 +-------------------------------------
 arch/x86/kernel/e820.c        |   56 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/setup.c       |    2 +
 4 files changed, 69 insertions(+), 51 deletions(-)

diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 334281f..cd7de51 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -76,6 +76,15 @@ struct e820map {
 /* see comment in arch/x86/kernel/e820.c */
 extern struct e820map e820;
 
+#if defined(CONFIG_X86_OOSTORE) && defined(CONFIG_CPU_SUP_CENTAUR)
+extern int centaur_ram_top;
+void get_centaur_ram_top(void);
+#else
+static inline void get_centaur_ram_top(void)
+{
+}
+#endif
+
 extern unsigned long pci_mem_start;
 extern int e820_any_mapped(u64 start, u64 end, unsigned type);
 extern int e820_all_mapped(u64 start, u64 end, unsigned type);
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e58d978..bb49358 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -37,63 +37,14 @@ static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key)
 	mtrr_centaur_report_mcr(reg, lo, hi);	/* Tell the mtrr driver */
 }
 
-/*
- * Figure what we can cover with MCR's
- *
- * Shortcut: We know you can't put 4Gig of RAM on a winchip
- */
-static u32 __cpuinit ramtop(void)
-{
-	u32 clip = 0xFFFFFFFFUL;
-	u32 top = 0;
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		unsigned long start, end;
-
-		if (e820.map[i].addr > 0xFFFFFFFFUL)
-			continue;
-		/*
-		 * Don't MCR over reserved space. Ignore the ISA hole
-		 * we frob around that catastrophe already
-		 */
-		if (e820.map[i].type == E820_RESERVED) {
-			if (e820.map[i].addr >= 0x100000UL &&
-			    ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:04 pm

Move aperture_valid back to .c

and early path still use e820_any_mapped()

So later we can make e820_any_mapped() __init

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/include/asm/gart.h   |   22 ----------------------
 arch/x86/kernel/aperture_64.c |   22 ++++++++++++++++++++++
 drivers/char/agp/amd64-agp.c  |   39 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 60 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 4ac5b0f..2b63a91 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -74,26 +74,4 @@ static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
         pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
 }
 
-static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
-{
-	if (!aper_base)
-		return 0;
-
-	if (aper_base + aper_size > 0x100000000ULL) {
-		printk(KERN_INFO "Aperture beyond 4GB. Ignoring.\n");
-		return 0;
-	}
-	if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
-		printk(KERN_INFO "Aperture pointing to e820 RAM. Ignoring.\n");
-		return 0;
-	}
-	if (aper_size < min_size) {
-		printk(KERN_INFO "Aperture too small (%d MB) than (%d MB)\n",
-				 aper_size>>20, min_size>>20);
-		return 0;
-	}
-
-	return 1;
-}
-
 #endif /* _ASM_X86_GART_H */
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 3704997..f6e6270 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -145,6 +145,28 @@ static u32 __init find_cap(int bus, int slot, int func, int cap)
 	return 0;
 }
 
+static int __init aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
+{
+	if (!aper_base)
+		return 0;
+
+	if (aper_base + aper_size > 0x100000000ULL) {
+		printk(KERN_INFO "Aperture beyond 4GB. Ignoring.\n");
+		return 0;
+	}
+	if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
+		printk(KERN_INFO "Aperture pointing to e820 ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:04 pm

Finally no user after init boot stage. We can free it to save some bytes.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/include/asm/e820.h |    2 --
 arch/x86/kernel/e820.c      |    2 +-
 2 files changed, 1 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index cd7de51..f2ab72e 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -73,8 +73,6 @@ struct e820map {
 #define BIOS_END		0x00100000
 
 #ifdef __KERNEL__
-/* see comment in arch/x86/kernel/e820.c */
-extern struct e820map e820;
 
 #if defined(CONFIG_X86_OOSTORE) && defined(CONFIG_CPU_SUP_CENTAUR)
 extern int centaur_ram_top;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 471784b..2c1260f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -35,7 +35,7 @@
  * user can e.g. boot the original kernel with mem=1G while still booting the
  * next kernel with full memory.
  */
-struct e820map e820;
+static struct e820map __initdata e820;
 static struct e820map __initdata e820_saved;
 
 /* For PCI or other memory-mapped resources */
-- 
1.6.4.2

--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

lmb_memory_size() will return memory size in lmb.memory.region.
lmb_free_memory_size() will return free memory size in lmb.memory.region.

So we can get the exact reserved size in a specified range.

Set the size right after initmem_init(), because later bootmem API will
get area above 16M. (except some fallback).

Later after we remove the bootmem, We could call that just before paging_init().

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/include/asm/e820.h |    2 ++
 arch/x86/kernel/e820.c      |   17 +++++++++++++++++
 arch/x86/kernel/setup.c     |    1 +
 arch/x86/mm/init_64.c       |    7 -------
 4 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index de6cd06..334281f 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -117,6 +117,8 @@ extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
 
 void init_lmb_memory(void);
 void fill_lmb_memory(void);
+void find_lmb_dma_reserve(void);
+
 extern void finish_e820_parsing(void);
 extern void e820_reserve_resources(void);
 extern void e820_reserve_resources_late(void);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e0ba87d..57c938a 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1151,3 +1151,20 @@ void __init fill_lmb_memory(void)
 	lmb_analyze();
 	lmb_dump_all();
 }
+
+void __init find_lmb_dma_reserve(void)
+{
+#ifdef CONFIG_X86_64
+	u64 free_size_pfn;
+	u64 mem_size_pfn;
+	/*
+	 * need to find out used area below MAX_DMA_PFN
+	 * need to use lmb to get free size in [0, MAX_DMA_PFN]
+	 * at first, and assume boot_mem will not take below MAX_DMA_PFN
+	 */
+	mem_size_pfn = lmb_memory_size(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
+	free_size_pfn = lmb_free_memory_size(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
+	set_dma_reserve(mem_size_pfn - free_size_pfn);
+#endif
+}
+
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

32bit can use the generic __find_lmb_area now, so we can turn the arch one off

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/Kconfig |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index da9040b..bbe4e99 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -584,7 +584,7 @@ config PARAVIRT_DEBUG
 	  a paravirt_op is missing when it is called.
 
 config ARCH_LMB_FIND_AREA
-	default y
+	default n
 	bool "Use x86 own lmb_find_area()"
 	---help---
 	  Use lmb_find_area() version instead of generic version, it get free
-- 
1.6.4.2

--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

and some functions in e820.c that are not used anymore

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/include/asm/e820.h |   14 -
 arch/x86/kernel/e820.c      |   42 ---
 include/linux/early_res.h   |   23 --
 kernel/early_res.c          |  584 -------------------------------------------
 4 files changed, 0 insertions(+), 663 deletions(-)
 delete mode 100644 include/linux/early_res.h
 delete mode 100644 kernel/early_res.c

diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 396c849..de6cd06 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -111,32 +111,18 @@ static inline void early_memtest(unsigned long start, unsigned long end)
 }
 #endif
 
-extern unsigned long end_user_pfn;
-
-extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
-extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
-extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
-
 extern unsigned long e820_end_of_ram_pfn(void);
 extern unsigned long e820_end_of_low_ram_pfn(void);
-extern void e820_register_active_regions(int nid, unsigned long start_pfn,
-					 unsigned long end_pfn);
-extern u64 e820_hole_size(u64 start, u64 end);
-
 extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
 
 void init_lmb_memory(void);
 void fill_lmb_memory(void);
-
 extern void finish_e820_parsing(void);
 extern void e820_reserve_resources(void);
 extern void e820_reserve_resources_late(void);
 extern void setup_memory_map(void);
 extern char *default_machine_specific_memory_setup(void);
 
-void reserve_early(u64 start, u64 end, char *name);
-void free_early(u64 start, u64 end);
-
 /*
  * Returns true iff the specified range [s,e) is completely contained inside
  * the ISA region.
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 3fa3c0a..e0ba87d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -744,22 +744,6 @@ core_initcall(e820_mark_nvs_memory);
 #endif
 ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

Do not use 0x8000 hard code value anymore.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/mm/numa_64.c |    2 +-
 arch/x86/mm/srat_64.c |    4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 18d2296..b8438ac 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -88,7 +88,7 @@ static int __init allocate_cachealigned_memnodemap(void)
 	if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
 		return 0;
 
-	addr = 0x8000;
+	addr = __pa(MAX_DMA_ADDRESS);
 	nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
 	nodemap_addr = lmb_find_area(addr, max_pfn<<PAGE_SHIFT,
 				      nodemap_size, L1_CACHE_BYTES);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 84e11b9..416b665 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -99,8 +99,8 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 	unsigned long phys;
 
 	length = slit->header.length;
-	phys = lmb_find_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
-		 PAGE_SIZE);
+	phys = lmb_find_area(__pa(MAX_DMA_ADDRESS), max_pfn_mapped<<PAGE_SHIFT,
+				 length, PAGE_SIZE);
 
 	if (phys == -1L)
 		panic(" Can not save slit!\n");
-- 
1.6.4.2

--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:04 pm

Introduce nobootmem.c to hold wrapper for CONFIG_NO_BOOTMEM=y.

that will remove related #ifdef in bootmem.c

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 mm/Makefile    |    8 +-
 mm/bootmem.c   |  151 +----------------------
 mm/nobootmem.c |  389 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 397 insertions(+), 151 deletions(-)
 create mode 100644 mm/nobootmem.c

diff --git a/mm/Makefile b/mm/Makefile
index 52492f9..2ab3039 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -7,12 +7,18 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
 			   vmalloc.o pagewalk.o
 
-obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
+obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o mmu_context.o \
 			   $(mmu-y)
+ifdef CONFIG_NO_BOOTMEM
+	obj-y		+= nobootmem.o
+else
+	obj-y		+= bootmem.o
+endif
+
 obj-y += init-mm.o
 
 obj-$(CONFIG_HAVE_LMB) += lmb.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2a4c8b5..2741c34 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -35,7 +35,6 @@ unsigned long max_pfn;
 unsigned long saved_max_pfn;
 #endif
 
-#ifndef CONFIG_NO_BOOTMEM
 bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
 
 static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -146,7 +145,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 	min_low_pfn = start;
 	return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
 }
-#endif
+
 /*
  * free_bootmem_late - free bootmem pages directly to page allocator
  * @addr: starting address of the range
@@ -171,53 +170,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
 	}
 }
 
-#ifdef CONFIG_NO_BOOTMEM
-static ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

lmb_register_active_regions() will be used to fill early_node_map,
the result will be lmb.memory.region AND numa data

lmb_hole_size will be used to find hole size on lmb.memory.region
with specified range.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 include/linux/lmb.h |    4 +++
 mm/lmb.c            |   68 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 0 deletions(-)

diff --git a/include/linux/lmb.h b/include/linux/lmb.h
index 2ee2cc1..cf8f7ca 100644
--- a/include/linux/lmb.h
+++ b/include/linux/lmb.h
@@ -95,6 +95,10 @@ void lmb_to_bootmem(u64 start, u64 end);
 struct range;
 int get_free_all_memory_range(struct range **rangep, int nodeid);
 
+void lmb_register_active_regions(int nid, unsigned long start_pfn,
+					 unsigned long last_pfn);
+u64 lmb_hole_size(u64 start, u64 end);
+
 #include <asm/lmb.h>
 
 #endif /* __KERNEL__ */
diff --git a/mm/lmb.c b/mm/lmb.c
index f11df14..cf0f1c9 100644
--- a/mm/lmb.c
+++ b/mm/lmb.c
@@ -790,3 +790,71 @@ u64 __init __weak lmb_find_area(u64 start, u64 end, u64 size, u64 align)
 	}
 	return -1ULL;
 }
+/*
+ * Finds an active region in the address range from start_pfn to last_pfn and
+ * returns its range in ei_startpfn and ei_endpfn for the lmb entry.
+ */
+static int __init lmb_find_active_region(const struct lmb_property *ei,
+				  unsigned long start_pfn,
+				  unsigned long last_pfn,
+				  unsigned long *ei_startpfn,
+				  unsigned long *ei_endpfn)
+{
+	u64 align = PAGE_SIZE;
+
+	*ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT;
+	*ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT;
+
+	/* Skip map entries smaller than a page */
+	if (*ei_startpfn >= *ei_endpfn)
+		return 0;
+
+	/* Skip if map is outside the node */
+	if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn)
+		return 0;
+
+	/* Check for overlaps */
+	if (*ei_startpfn < start_pfn)
+		*ei_startpfn = start_pfn;
+	if (*ei_endpfn > last_pfn)
+		*ei_endpfn = ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:04 pm

We can remove #ifdef in mm/page_alloc.c

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 mm/bootmem.c    |    7 +++++++
 mm/nobootmem.c  |    5 +++++
 mm/page_alloc.c |    9 ---------
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2741c34..ff55ad7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -23,6 +23,13 @@
 
 #include "internal.h"
 
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data = {
+ .bdata = &bootmem_node_data[0]
+ };
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
 unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 283673e..abaec96 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -23,6 +23,11 @@
 
 #include "internal.h"
 
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data;
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
 unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 256aed0..233c403 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4496,15 +4496,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
 	dma_reserve = new_dma_reserve;
 }
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = {
-#ifndef CONFIG_NO_BOOTMEM
- .bdata = &bootmem_node_data[0]
-#endif
- };
-EXPORT_SYMBOL(contig_page_data);
-#endif
-
 void __init free_area_init(unsigned long *zones_size)
 {
 	free_area_init_node(0, zones_size,
-- 
1.6.4.2

--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

Walk the node ranges in early_node_map[] and use __lmb_find_area()
to find a free range.

Will be used by lmb_find_area_node()

lmb_find_area_node will be used to find right buffer for NODE_DATA

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 include/linux/mm.h |    2 ++
 mm/page_alloc.c    |   29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fb19bb9..7774e1d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1159,6 +1159,8 @@ extern void free_bootmem_with_active_regions(int nid,
 						unsigned long max_low_pfn);
 int add_from_early_node_map(struct range *range, int az,
 				   int nr_range, int nid);
+u64 __init find_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit);
 void *__alloc_memory_core_early(int nodeid, u64 size, u64 align,
 				 u64 goal, u64 limit);
 typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d03c946..12a74ad 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/bootmem.h>
+#include <linux/lmb.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/kmemcheck.h>
@@ -3393,6 +3394,34 @@ void __init free_bootmem_with_active_regions(int nid,
 	}
 }
 
+#ifdef CONFIG_HAVE_LMB
+u64 __init find_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit)
+{
+	int i;
+
+	/* Need to go over early_node_map to find out good range for node */
+	for_each_active_range_index_in_nid(i, nid) {
+		u64 addr;
+		u64 ei_start, ei_last;
+
+		ei_last = early_node_map[i].end_pfn;
+		ei_last <<= PAGE_SHIFT;
+		ei_start = early_node_map[i].start_pfn;
+		ei_start <<= PAGE_SHIFT;
+		addr = __lmb_find_area(ei_start, ei_last,
+					 goal, limit, size, align);
+
+		if (addr == -1ULL)
+			continue;
+
+		return addr;
+	}
+
+	return ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

lmb.c is memory related, so move it to mm/. It is suggested by Ingo

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 lib/Makefile      |    2 --
 mm/Makefile       |    2 ++
 {lib => mm}/lmb.c |    0
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename {lib => mm}/lmb.c (100%)

diff --git a/lib/Makefile b/lib/Makefile
index 2e152ae..a463a4d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -85,8 +85,6 @@ obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
 
 lib-$(CONFIG_GENERIC_BUG) += bug.o
 
-obj-$(CONFIG_HAVE_LMB) += lmb.o
-
 obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
 
 obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o
diff --git a/mm/Makefile b/mm/Makefile
index 6c2a73a..52492f9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,6 +15,8 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   $(mmu-y)
 obj-y += init-mm.o
 
+obj-$(CONFIG_HAVE_LMB) += lmb.o
+
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
diff --git a/lib/lmb.c b/mm/lmb.c
similarity index 100%
rename from lib/lmb.c
rename to mm/lmb.c
-- 
1.6.4.2

--

From: Benjamin Herrenschmidt
Date: Monday, April 12, 2010 - 8:52 pm

Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>


--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

The generic version goes from high to low, and it seems it cannot find
a suitable area that is compact enough.

The x86 version goes from goal to limit, just like the way we used
for early_res.

Use ARCH_LMB_FIND_AREA to select between them.

For 32 bit we have to use CONFIG_ARCH_LMB_FIND_AREA=y, because some alloc_bootmem
calls in the nobootmem config hard code -1ULL as the limit.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/Kconfig  |    8 +++++
 arch/x86/mm/lmb.c |   78 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 0 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d3b7bb3..7415db5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -585,6 +585,14 @@ config PARAVIRT_DEBUG
 	  Enable to debug paravirt_ops internals.  Specifically, BUG if
 	  a paravirt_op is missing when it is called.
 
+config ARCH_LMB_FIND_AREA
+	default y
+	bool "Use x86 own lmb_find_area()"
+	---help---
+	  Use lmb_find_area() version instead of generic version, it get free
+	  area up from low.
+	  Generic one try to get free area down from limit.
+
 config NO_BOOTMEM
 	default y
 	bool "Disable Bootmem code"
diff --git a/arch/x86/mm/lmb.c b/arch/x86/mm/lmb.c
index 3229e9e..302a205 100644
--- a/arch/x86/mm/lmb.c
+++ b/arch/x86/mm/lmb.c
@@ -86,3 +86,81 @@ u64 __init lmb_find_area_size(u64 start, u64 *sizep, u64 align)
 	return -1ULL;
 }
 
+#ifdef CONFIG_ARCH_LMB_FIND_AREA
+static int __init find_overlapped_early(u64 start, u64 end)
+{
+	int i;
+	struct lmb_property *r;
+
+	for (i = 0; i < lmb.reserved.cnt && lmb.reserved.region[i].size; i++) {
+		r = &lmb.reserved.region[i];
+		if (end > r->base && start < (r->base + r->size))
+			break;
+	}
+
+	return i;
+}
+
+/* Check for already reserved areas */
+static inline bool __init bad_addr(u64 *addrp, u64 size, u64 align)
+{
+	int i;
+	u64 addr = *addrp;
+	bool changed = false;
+	struct lmb_property *r;
+again:
+	i = find_overlapped_early(addr, addr + size);
+	r = ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

The generic version of __lmb_find_area() goes from high to low, and on 32bit
the active regions do include high pages.

need to replace the limit with lmb.default_alloc_limit, aka get_max_mapped()

with this patch, x86 32bit could use generic version of __lmb_find_area()

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 mm/page_alloc.c |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 79bd44b..256aed0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3445,6 +3445,9 @@ void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 
 	u64 addr;
 
+	if (limit > lmb.default_alloc_limit)
+		limit = lmb.default_alloc_limit;
+
 	addr = find_memory_core_early(nid, size, align, goal, limit);
 
 	if (addr == -1ULL)
-- 
1.6.4.2

--

From: Benjamin Herrenschmidt
Date: Monday, April 12, 2010 - 9:23 pm

So you unconditionally add access to some lmb specific data structure to
generic code ? That isn't going to work very well on archs that don't
use lmb.

Also, those things should be local to lmb_* anyways.

Cheers,


--

From: Yinghai
Date: Monday, April 12, 2010 - 9:50 pm

the whole function

#ifdef CONFIG_HAVE_LMB
/*
 * find_memory_core_early - search the active ranges of a node for a free area
 * @nid:   node id handed to for_each_active_range_index_in_nid()
 * @size:  size passed through to __lmb_find_area()
 * @align: alignment passed through to __lmb_find_area()
 * @goal:  lower search bound passed through to __lmb_find_area()
 * @limit: upper search bound passed through to __lmb_find_area()
 *
 * Returns the first address for which __lmb_find_area() did not report
 * -1ULL, or -1ULL when every range selected for @nid came back with -1ULL.
 */
u64 __init find_memory_core_early(int nid, u64 size, u64 align,
                                        u64 goal, u64 limit)
{
        int idx;

        /* Visit each early_node_map entry the iterator selects for nid */
        for_each_active_range_index_in_nid(idx, nid) {
                u64 range_start = early_node_map[idx].start_pfn;
                u64 range_end = early_node_map[idx].end_pfn;
                u64 found;

                /* The pfn bounds are shifted into addresses in u64 variables */
                range_start <<= PAGE_SHIFT;
                range_end <<= PAGE_SHIFT;

                found = __lmb_find_area(range_start, range_end,
                                         goal, limit, size, align);

                /* The first range that yields an address wins */
                if (found != -1ULL)
                        return found;
        }

        return -1ULL;
}
#endif


if you insist, could move it to lmb.c and use work_with_active_regions() around it.

YH
--

From: Benjamin Herrenschmidt
Date: Monday, April 12, 2010 - 10:13 pm

Hrm... find_memory_core_early() is a broken API anyways. Did you add
that ? Again, you insist on pushing all over the place that crakpot
find/reserve API instead of doing a proper allocation, and it's now
leaking with ifdef's & all into the generic code.

This is just all a pile of shit.

I'm tempted to NACK the whole thing and wait for somebody who can code
to come up with something half decent.



--

From: Yinghai
Date: Monday, April 12, 2010 - 10:42 pm

__alloc_memory_core_early() already include that find_memory_core_early() lines
it is merged for CONFIG_NO_BOOTMEM support.

I split it out, so lmb_find_area_node() could reuse those lines.

YH
--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

It will try to find an area matching size/align in the specified range (start, end).

lmb_find_area() will honor goal/limit.

also make it more easy for x86 to use lmb.
x86 early_res is using find/reserve pattern instead of alloc.

When we need a temporary buffer for a range array etc. for range work, if we are using
lmb_alloc(), we will need to add some post-fix code for the buffer that is used
by the range array, because it is already in lmb.reserved, and we have to call
an extra lmb_free().

-v2: Change name to lmb_find_area() according to Michael Ellerman
-v3: Add generic weak version __lmb_find_area()

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 include/linux/lmb.h |    4 ++++
 mm/lmb.c            |   49 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 0 deletions(-)

diff --git a/include/linux/lmb.h b/include/linux/lmb.h
index e14ea8d..4cf2f3b 100644
--- a/include/linux/lmb.h
+++ b/include/linux/lmb.h
@@ -83,6 +83,10 @@ lmb_end_pfn(struct lmb_region *type, unsigned long region_nr)
 	       lmb_size_pages(type, region_nr);
 }
 
+u64 __lmb_find_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
+			 u64 size, u64 align);
+u64 lmb_find_area(u64 start, u64 end, u64 size, u64 align);
+
 #include <asm/lmb.h>
 
 #endif /* __KERNEL__ */
diff --git a/mm/lmb.c b/mm/lmb.c
index 392d805..7010212 100644
--- a/mm/lmb.c
+++ b/mm/lmb.c
@@ -11,9 +11,13 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/types.h>
 #include <linux/init.h>
 #include <linux/bitops.h>
 #include <linux/lmb.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <linux/range.h>
 
 #define LMB_ALLOC_ANYWHERE	0
 
@@ -559,3 +563,48 @@ int lmb_find(struct lmb_property *res)
 	}
 	return -1;
 }
+
+u64 __init __weak __lmb_find_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
+				 u64 size, u64 align)
+{
+	u64 final_start, final_end;
+	u64 mem;
+
+	final_start = max(ei_start, start);
+	final_end = min(ei_last, end);
+
+	if (final_start >= ...
From: Benjamin Herrenschmidt
Date: Monday, April 12, 2010 - 9:05 pm

Haven't you noticed there's already way too many functions walking the
LMBs ? :-)

I think the ones doing nid alloc could/should be also rewritten to use
one single low level __lmb_find_* no ?

Cheers,


--

From: Yinghai
Date: Monday, April 12, 2010 - 9:29 pm

x86 is using original lmb_reserve, lmb_free(), but have own version lmb_find_area(), and it will be dropped after

that nid_alloc() only has one user (sparc64).

maybe it could be replaced by lmb_find_area_node(), but we need to make sure early_node_map[] is filled first.

Thanks

Yinghai
--

From: Benjamin Herrenschmidt
Date: Monday, April 12, 2010 - 10:07 pm

Do -not- add no APIs that are meant to be dropped. They never are in
practice. What I'm saying here is that the LMB code (including existing

How does it work today ? IE. Which ever mechanism is used that works I
don't care but we shouldn't use 2 different ones.

Cheers,
Ben.

--

From: Yinghai
Date: Monday, April 12, 2010 - 10:26 pm

current generic lmb_find_area() is allocating from high to low.
x86 32bit seems have problem with that.
in this patchset, it is fixed, but not sure if i missed sth.

x86 is only use find_area_early() with node area scope, David point that could have problem with cross node mem map.

YH 
--

From: H. Peter Anvin
Date: Monday, April 12, 2010 - 10:46 pm

Presumably because it fills up ZONE_DMA.

	-hpa
-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--

From: Benjamin Herrenschmidt
Date: Tuesday, April 13, 2010 - 3:15 am

I'm working on some LMB cleanups now, among others trying to take into
account what I think are Yinghai requirements. Give me a few days.

For the specific problem above, my idea is to have the low level alloc
function be able to take both low and high limits, so that x86 can
figure out what's best for a given allocation.

Cheers,
Ben.


--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

1. replace find_e820_area with lmb_find_area
2. replace reserve_early with lmb_reserve_area
3. replace free_early with lmb_free_area.
4. NO_BOOTMEM will switch to use lmb too.
5. use _e820, _early wrap in the patch, in following patch, will
   replace them all
6. because lmb_free_area support partial free, we can remove some special care
7. Need to make sure that lmb_find_area() is called after fill_lmb_memory()
   so adjust some calling later in setup.c::setup_arch()
   -- corruption_check and mptable_update

-v2: Move reserve_brk() early
    Before fill_lmb_area, to avoid overlap between brk and lmb_find_area()
    that could happen when we have more than 128 RAM entries in the E820 tables, and
    fill_lmb_memory() could use lmb_find_area() to find a new place for
    the lmb.memory.region array.
    Also, we don't need to use extend_brk() after fill_lmb_area().
    So move reserve_brk() early before fill_lmb_area().
-v3: Move find_smp_config early
    To make sure lmb_find_area not find wrong place, if BIOS doesn't put mptable
    in right place.
-v4: Treat RESERVED_KERN as RAM in lmb.memory. and they are already in
    lmb.reserved already..
    use __NOT_KEEP_LMB to make sure lmb related code could be freed later.

Suggested-by: David S. Miller <davem@davemloft.net>
Suggested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/Kconfig               |    9 +--
 arch/x86/include/asm/e820.h    |   15 +++--
 arch/x86/include/asm/lmb.h     |   12 +++
 arch/x86/kernel/check.c        |   14 ++--
 arch/x86/kernel/e820.c         |  147 ++++++++++-----------------------------
 arch/x86/kernel/head.c         |    3 +-
 arch/x86/kernel/head32.c       |    6 +-
 arch/x86/kernel/head64.c       |    3 +
 arch/x86/kernel/mpparse.c      |    5 +-
 arch/x86/kernel/setup.c        |   46 +++++++++----
 arch/x86/kernel/setup_percpu.c |    6 --
 arch/x86/mm/numa_64.c          |    5 +-
 ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

We could call free_bootmem_late() if swiotlb is not used, and
it will shrink to page alignment.

So allocate them with page alignment at first, to avoid losing two pages

before patch:
[    0.000000]     lmb_reserve_area: [00d3600000, 00d7600000]   swiotlb buffer
[    0.000000]     lmb_reserve_area: [00d7e7ef40, 00d7e9ef40]     swiotlb list
[    0.000000]     lmb_reserve_area: [00d7e3ef40, 00d7e7ef40]  swiotlb orig_ad
[    0.000000]     lmb_reserve_area: [000008a000, 0000092000]  swiotlb overflo

after patch will get
[    0.000000]     lmb_reserve_area: [00d3600000, 00d7600000]   swiotlb buffer
[    0.000000]     lmb_reserve_area: [00d7e7e000, 00d7e9e000]     swiotlb list
[    0.000000]     lmb_reserve_area: [00d7e3e000, 00d7e7e000]  swiotlb orig_ad
[    0.000000]     lmb_reserve_area: [000008a000, 0000092000]  swiotlb overflo

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Becky Bruce <beckyb@kernel.crashing.org>
---
 lib/swiotlb.c |   16 ++++++++--------
 1 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 5fddf72..1bd4258 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -159,7 +159,7 @@ swiotlb_init_with_default_size(size_t default_size, int verbose)
 	/*
 	 * Get IO TLB memory from the low pages
 	 */
-	io_tlb_start = alloc_bootmem_low_pages(bytes);
+	io_tlb_start = alloc_bootmem_low_pages(PAGE_ALIGN(bytes));
 	if (!io_tlb_start)
 		panic("Cannot allocate SWIOTLB buffer");
 	io_tlb_end = io_tlb_start + bytes;
@@ -169,16 +169,16 @@ swiotlb_init_with_default_size(size_t default_size, int verbose)
 	 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
 	 * between io_tlb_start and io_tlb_end.
 	 */
-	io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
+	io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
 	for (i = 0; i < io_tlb_nslabs; i++)
  		io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
 ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

to workaround wrong BIOS memory map.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/kernel/e820.c |   44 ++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 44 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 14d0a1a..73dc6a7 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1044,6 +1044,47 @@ static int __init parse_memmap_opt(char *p)
 }
 early_param("memmap", parse_memmap_opt);
 
+static void __init e820_align_ram_page(void)
+{
+	int i;
+	bool changed = false;	/* set once any RAM entry had to be trimmed */
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *entry = &e820.map[i];
+		u64 start, end;
+		u64 start_aligned, end_aligned;
+
+		if (entry->type != E820_RAM)	/* only RAM entries are checked */
+			continue;
+
+		start = entry->addr;
+		end = start + entry->size;
+
+		start_aligned = round_up(start, PAGE_SIZE);
+		end_aligned = round_down(end, PAGE_SIZE);
+
+		if (end_aligned <= start_aligned) {	/* no whole page inside: retype the entire entry */
+			e820_update_range(start, end - start, E820_RAM, E820_RESERVED);
+			changed = true;
+			continue;
+		}
+		if (start < start_aligned) {	/* partial page at the head */
+			e820_update_range(start, start_aligned - start, E820_RAM, E820_RESERVED);
+			changed = true;
+		}
+		if (end_aligned < end) {	/* partial page at the tail */
+			e820_update_range(end_aligned, end - end_aligned, E820_RAM, E820_RESERVED);
+			changed = true;
+		}
+	}
+
+	if (changed) {	/* re-sanitize and print the map only when something was retyped */
+		sanitize_e820_map();
+		printk(KERN_INFO "aligned physical RAM map:\n");
+		e820_print_map("aligned");
+	}
+}
+
 void __init finish_e820_parsing(void)
 {
 	if (userdef) {
@@ -1056,6 +1097,9 @@ void __init finish_e820_parsing(void)
 		printk(KERN_INFO "user-defined physical RAM map:\n");
 		e820_print_map("user");
 	}
+
+	/* In case, We have RAM entres that are not PAGE aligned */
+	e820_align_ram_page();
 }
 
 static inline const char *e820_type_to_string(int e820_type)
-- 
1.6.4.2

--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

Also let lmb_reserve_area/lmb_free_area print out the name if lmb=debug is
specified.

It will also print the name when reserve_lmb_area/free_lmb_area are called.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 mm/lmb.c |   29 +++++++++++++++++++++--------
 1 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/mm/lmb.c b/mm/lmb.c
index 34fc030..dfbf660 100644
--- a/mm/lmb.c
+++ b/mm/lmb.c
@@ -591,8 +591,9 @@ static void __init __check_and_double_region_array(struct lmb_region *type,
 	memset(&old[0], 0, sizeof(struct lmb_property) * rgnsz);
 	type->region = new;
 	type->nr_regions = rgnsz * 2;
-	printk(KERN_DEBUG "lmb.reserved.region array is doubled to %ld at [%llx - %llx]\n",
-		type->nr_regions, mem, mem + size - 1);
+	if (lmb_debug)
+		pr_info("lmb.reserved.region array is doubled to %ld at [%010llx - %010llx]\n",
+				type->nr_regions, mem, mem + size - 1);
 
 	/* Free old one ?*/
 	if (old != static_region)
@@ -619,6 +620,8 @@ void __init lmb_reserve_area(u64 start, u64 end, char *name)
 	if (WARN_ONCE(start > end, "lmb_reserve_area: wrong range [%#llx, %#llx]\n", start, end))
 		return;
 
+	if (lmb_debug)
+		pr_info("    lmb_reserve_area: [%010llx, %010llx] %16s\n", start, end, name);
 	__lmb_reserve_area(start, end, name);
 }
 
@@ -635,6 +638,8 @@ void __init lmb_reserve_area_overlap_ok(u64 start, u64 end, char *name)
 	if (WARN_ONCE(start > end, "lmb_reserve_area_overlap_ok: wrong range [%#llx, %#llx]\n", start, end))
 		return;
 
+	if (lmb_debug)
+		pr_info("    lmb_reserve_area_overlap_ok: [%010llx, %010llx] %16s\n", start, end, name);
 	/* Free that region at first */
 	lmb_free(start, end - start);
 	__lmb_reserve_area(start, end, name);
@@ -648,6 +653,8 @@ void __init lmb_free_area(u64 start, u64 end)
 	if (WARN_ONCE(start > end, "lmb_free_area: wrong range [%#llx, %#llx]\n", start, end))
 		return;
 
+	if (lmb_debug)
+		pr_info("       lmb_free_area: [%010llx, %010llx]\n", start, end);
 	/* keep punching hole, could run out of ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

get_free_all_memory_range is for CONFIG_NO_BOOTMEM=y, and will be called by
free_all_memory_core_early().

It will use early_node_map aka active ranges subtract lmb.reserved to
get all free range, and those ranges will convert to slab pages.

-v3: use __lmb_find_base() to get range free buffer.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Jan Beulich <jbeulich@novell.com>
---
 include/linux/lmb.h |    2 +
 mm/lmb.c            |   86 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 87 insertions(+), 1 deletions(-)

diff --git a/include/linux/lmb.h b/include/linux/lmb.h
index 1e236d1..2ee2cc1 100644
--- a/include/linux/lmb.h
+++ b/include/linux/lmb.h
@@ -92,6 +92,8 @@ u64 __lmb_find_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
 u64 lmb_find_area(u64 start, u64 end, u64 size, u64 align);
 
 void lmb_to_bootmem(u64 start, u64 end);
+struct range;
+int get_free_all_memory_range(struct range **rangep, int nodeid);
 
 #include <asm/lmb.h>
 
diff --git a/mm/lmb.c b/mm/lmb.c
index ee3d945..f11df14 100644
--- a/mm/lmb.c
+++ b/mm/lmb.c
@@ -630,7 +630,91 @@ void __init lmb_free_area(u64 start, u64 end)
 	__check_and_double_region_array(&lmb.reserved, &lmb_reserved_region[0]);
 }
 
-#ifndef CONFIG_NO_BOOTMEM
+static __init struct range *find_range_array(int count)
+{
+	u64 end, size, mem;
+	struct range *range;
+
+	size = sizeof(struct range) * count;
+	end = lmb.default_alloc_limit;
+
+	mem = __lmb_find_base(size, sizeof(struct range), end);
+	if (mem == -1ULL)
+		panic("can not find more space for range array");
+
+	/*
+	 * This range is tempoaray, so don't reserve it, it will not be
+	 * overlapped because We will not alloccate new buffer before
+	 * We discard this one
+	 */
+	range = __va(mem);
+	memset(range, 0, size);
+
+	return range;
+}
+
+#ifdef CONFIG_NO_BOOTMEM
+static void __init subtract_lmb_reserved(struct range *range, int az)
+{
+	int i, count;
+	u64 final_start, final_end;
+
+	/* Take out region array ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

So We don't need to take e820.map with it.

Also change e820_saved to initdata to get some bytes memory back.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/include/asm/e820.h |    5 ++---
 arch/x86/kernel/e820.c      |   26 ++++++++++++++++++--------
 arch/x86/kernel/efi.c       |    2 +-
 arch/x86/kernel/setup.c     |   10 +++++-----
 arch/x86/xen/setup.c        |    4 +---
 5 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index ec8a52d..0457c49 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -75,15 +75,14 @@ struct e820map {
 #ifdef __KERNEL__
 /* see comment in arch/x86/kernel/e820.c */
 extern struct e820map e820;
-extern struct e820map e820_saved;
 
 extern unsigned long pci_mem_start;
 extern int e820_any_mapped(u64 start, u64 end, unsigned type);
 extern int e820_all_mapped(u64 start, u64 end, unsigned type);
 extern void e820_add_region(u64 start, u64 size, int type);
 extern void e820_print_map(char *who);
-extern int
-sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map);
+int sanitize_e820_map(void);
+void save_e820_map(void);
 extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
 			       unsigned new_type);
 extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7bca3c6..14d0a1a 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -35,7 +35,7 @@
  * next kernel with full memory.
  */
 struct e820map e820;
-struct e820map e820_saved;
+static struct e820map __initdata e820_saved;
 
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0xaeedbabe;
@@ -224,7 +224,7 @@ void __init e820_print_map(char *who)
  *	   ______________________4_
  */
 
-int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
+static int __init __sanitize_e820_map(struct e820entry ...
From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

They will check if the region array is big enough.

__check_and_double_region_array will try to double the region array if that
array spare slots is not big enough.  Old array will be copied to new array.

Arch code should set lmb.default_alloc_limit accordingly, so the new array is at
an accessible address.

-v2: change get_max_mapped() to lmb.default_alloc_limit according to Michael
      Ellerman and Ben
     change to lmb_reserve_area and lmb_free_area according to Michael Ellerman
-v3: call check_and_double after reserve/free, so could avoid to use
      find_lmb_area. Suggested by Michael Ellerman

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 include/linux/lmb.h |    4 +++
 mm/lmb.c            |   66 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 0 deletions(-)

diff --git a/include/linux/lmb.h b/include/linux/lmb.h
index 4cf2f3b..598662f 100644
--- a/include/linux/lmb.h
+++ b/include/linux/lmb.h
@@ -33,6 +33,7 @@ struct lmb_region {
 struct lmb {
 	unsigned long debug;
 	u64 rmo_size;
+	u64 default_alloc_limit;
 	struct lmb_region memory;
 	struct lmb_region reserved;
 };
@@ -83,6 +84,9 @@ lmb_end_pfn(struct lmb_region *type, unsigned long region_nr)
 	       lmb_size_pages(type, region_nr);
 }
 
+void lmb_reserve_area(u64 start, u64 end, char *name);
+void lmb_free_area(u64 start, u64 end);
+void lmb_add_memory(u64 start, u64 end);
 u64 __lmb_find_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
 			 u64 size, u64 align);
 u64 lmb_find_area(u64 start, u64 end, u64 size, u64 align);
diff --git a/mm/lmb.c b/mm/lmb.c
index 7010212..a514d41 100644
--- a/mm/lmb.c
+++ b/mm/lmb.c
@@ -564,6 +564,72 @@ int lmb_find(struct lmb_property *res)
 	return -1;
 }
 
+static void __init __check_and_double_region_array(struct lmb_region *type,
+			 struct lmb_property *static_region)
+{
+	u64 size, mem;
+	struct lmb_property *new, *old;
+	unsigned long rgnsz = type->nr_regions;
+
+	/* Do we have enough slots left ? ...
From: Benjamin Herrenschmidt
Date: Monday, April 12, 2010 - 9:15 pm

So a few things here:

default_alloc_limit: This should be a patch of its own I believe, we
should provide a way for callers to also honor the limit, I'm sure
without that we're going to hit funny problems -especially- if we start
replacing bootmem. (Heh, low/high mem anyone ?)

I would think that the basic lmb_alloc() should be modified to use the
current limit, and maybe add an lmb_alloc_anywhere() as an inline
wrapper to lmb_alloc_base(..., LMB_ALLOC_ANYWHERE); In fact, lmb_alloc()
should become an inline wrapper too.

Also, the way you added the calls to __check_and_double_region_array()
is fishy (what a function name btw !). IE. You added it in 2 or 3
places, missing a whole bunch, which will guarantee some kind of
unexpected behaviour especially when using the _nid variants.

Now, maybe the idea of moving things to -after- the call wasn't that
good. I still don't quite get why we can't do things lazily, especially
if we remove some of the code duplication in there. 

In any case, its about time to clarify what is API and what is internal
in LMB and clean up the entry path.

Cheers,


--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

Need to add protection in linux/lmb.h, to prepare to include it in
 mm/page_alloc.c and mm/bootmem.c etc.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 include/linux/lmb.h |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/include/linux/lmb.h b/include/linux/lmb.h
index cf8f7ca..5dbc4ef 100644
--- a/include/linux/lmb.h
+++ b/include/linux/lmb.h
@@ -2,6 +2,7 @@
 #define _LINUX_LMB_H
 #ifdef __KERNEL__
 
+#ifdef CONFIG_HAVE_LMB
 /*
  * Logical memory blocks.
  *
@@ -101,6 +102,8 @@ u64 lmb_hole_size(u64 start, u64 end);
 
 #include <asm/lmb.h>
 
+#endif /* CONFIG_HAVE_LMB */
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_LMB_H */
-- 
1.6.4.2

--

From: Yinghai Lu
Date: Thursday, April 8, 2010 - 11:03 pm

Separate those three functions so they can be shared by related callers.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 mm/lmb.c |   63 +++++++++++++++++++++++++++++++++++--------------------------
 1 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/mm/lmb.c b/mm/lmb.c
index ab3d85f..5483d69 100644
--- a/mm/lmb.c
+++ b/mm/lmb.c
@@ -684,16 +684,41 @@ static __init struct range *find_range_array(int count)
 }
 
 #ifdef CONFIG_NO_BOOTMEM
-static void __init subtract_lmb_reserved(struct range *range, int az)
+static void __init __free_lmb_reserved_region_array(void)
 {
-	int i, count;
-	u64 final_start, final_end;
-
 #ifdef ARCH_DISCARD_LMB
 	/* Take out region array itself at first*/
 	if (lmb.reserved.region != lmb_reserved_region)
 		lmb_free(__pa(lmb.reserved.region), sizeof(struct lmb_property) * lmb.reserved.nr_regions);
 #endif
+}
+static void __init __reserve_lmb_reserved_region_array(void)
+{
+#ifdef ARCH_DISCARD_LMB
+	/* Put region array back ? */
+	if (lmb.reserved.region != lmb_reserved_region)
+		lmb_reserve(__pa(lmb.reserved.region), sizeof(struct lmb_property) * lmb.reserved.nr_regions);
+#endif
+}
+
+static void __init __clear_lmb_reserved_region_array(void)
+{
+#ifdef ARCH_DISCARD_LMB
+	memset(&lmb.reserved.region[0], 0, sizeof(struct lmb_property) * lmb.reserved.nr_regions);
+	lmb.reserved.region = NULL;
+	lmb.reserved.nr_regions = 0;
+	lmb.reserved.cnt = 0;
+#endif
+}
+
+static void __init subtract_lmb_reserved(struct range *range, int az)
+{
+	int i, count;
+	u64 final_start, final_end;
+
+	/* Take out region array itself at first*/
+	__free_lmb_reserved_region_array();
+
 	count  = lmb.reserved.cnt;
 
 	if (lmb_debug)
@@ -709,11 +734,9 @@ static void __init subtract_lmb_reserved(struct range *range, int az)
 			continue;
 		subtract_range(range, az, final_start, final_end);
 	}
-#ifdef ARCH_DISCARD_LMB
+
 	/* Put region array back ? */
-	if (lmb.reserved.region != ...
From: Benjamin Herrenschmidt
Date: Monday, April 12, 2010 - 8:41 pm

I still find most of your changeset comments to be very very poor, if
comprehensible at all. You really MUST make an effort there.

Cheers,


--

Previous thread: [PATCH 18/39] lmb: Add lmb_reserve_area_overlap_ok() by Yinghai Lu on Thursday, April 8, 2010 - 11:03 pm. (5 messages)

Next thread: [PATCH 2/5] sched: add asymmetric packing option for sibling domain by Michael Neuling on Thursday, April 8, 2010 - 11:21 pm. (3 messages)