(relative to git-x86#mm 05afd57b4438d4650d93e6f54f465b0cdd2d9ea8) They should be all safe/useful enough to be considered .25 candidates. Some minor features for pageattr.c: - Use less memory in the !DEBUG_PAGEALLOC case (second patch for that is just a optional cleanup) - Support CPU self snoop (resubmit of earlier patch) - Add some statistic counters about the direct mapping to /proc/meminfo - Shrink code size by doing less inlining - Remove obsolete BUG()s The patches are all fairly independent (except the two debug-pagealloc patches) so it's possible to cherry-pick. -Andi --
Not much left from the original code and I don't want my name on it because there is code in there I disagree with. I don't think any of the Ben inspired code is left in there either, but I left his name in for now. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86/mm/pageattr.c | 1 - 1 file changed, 1 deletion(-) Index: linux/arch/x86/mm/pageattr.c =================================================================== --- linux.orig/arch/x86/mm/pageattr.c +++ linux/arch/x86/mm/pageattr.c @@ -1,5 +1,4 @@ /* - * Copyright 2002 Andi Kleen, SuSE Labs. * Thanks to Ben LaHaise for precious feedback. */ #include <linux/highmem.h> --
While I have to respect your decision, please clarify whether you are entitled to remove the SuSE Labs copyright as well. Thanks, tglx --
The function is huge and called three times, so don't inline it.
Signed-off-by: Andi Kleen <ak@suse.de>
---
arch/x86/mm/pageattr.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: linux/arch/x86/mm/pageattr.c
===================================================================
--- linux.orig/arch/x86/mm/pageattr.c
+++ linux/arch/x86/mm/pageattr.c
@@ -139,7 +139,7 @@ static unsigned long virt_to_direct(void
* right (again, ioremap() on BIOS memory is not uncommon) so this function
* checks and fixes these known static required protection bits.
*/
-static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
+static pgprot_t static_protections(pgprot_t prot, unsigned long address)
{
pgprot_t forbidden = __pgprot(0);
--
thanks, applied. As most inlines it started out small ;-) Ingo --
Even with the pool cpa in debug_pagealloc can fail (e.g. consider memory hotplug) Also the code does not clear PSE anymore as the comment claims. Replace wrong comment with correct one. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86/mm/pageattr.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) Index: linux/arch/x86/mm/pageattr.c =================================================================== --- linux.orig/arch/x86/mm/pageattr.c +++ linux/arch/x86/mm/pageattr.c @@ -864,10 +864,7 @@ void kernel_map_pages(struct page *page, if (!debug_pagealloc_enabled) return; - /* - * The return value is ignored - the calls cannot fail, - * large pages are disabled at boot time: - */ + /* This can fail in theory but then you're out of luck */ if (enable) __set_pages_p(page, numpages); else --
When DEBUG_PAGEALLOC is not set there is no need for the complicated
cpa pool because no recursion is possible in the memory allocator.
So ifdef this case. This will save some memory because
the pool won't be needed for normal operation.
Signed-off-by: Andi Kleen <ak@suse.de>
---
arch/x86/mm/pageattr.c | 27 +++++++++++++++++++++++++--
1 file changed, 25 insertions(+), 2 deletions(-)
Index: linux/arch/x86/mm/pageattr.c
===================================================================
--- linux.orig/arch/x86/mm/pageattr.c
+++ linux/arch/x86/mm/pageattr.c
@@ -345,6 +345,8 @@ out_unlock:
return do_split;
}
+#ifdef CONFIG_DEBUG_PAGEALLOC
+
static LIST_HEAD(page_pool);
static unsigned long pool_size, pool_pages, pool_low;
static unsigned long pool_used, pool_failed, pool_refill;
@@ -365,14 +367,12 @@ static void cpa_fill_pool(void)
if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill))
return;
-#ifdef CONFIG_DEBUG_PAGEALLOC
/*
* We could do:
* gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL;
* but this fails on !PREEMPT kernels
*/
gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
-#endif
while (pool_pages < pool_size) {
p = alloc_pages(gfp, 0);
@@ -416,6 +416,16 @@ void __init cpa_init(void)
pool_pages, pool_size);
}
+#else
+void __init cpa_init(void)
+{
+}
+
+static inline void cpa_fill_pool(void)
+{
+}
+#endif
+
static int split_large_page(pte_t *kpte, unsigned long address)
{
unsigned long flags, pfn, pfninc = 1;
@@ -424,12 +434,19 @@ static int split_large_page(pte_t *kpte,
pgprot_t ref_prot;
struct page *base;
+#ifndef CONFIG_DEBUG_PAGEALLOC
+ base = alloc_page(GFP_KERNEL);
+ if (!base)
+ return -ENOMEM;
+#endif
+
/*
* Get a page from the pool. The pool list is protected by the
* pgd_lock, which we have to take anyway for the split
* operation:
*/
spin_lock_irqsave(&pgd_lock, flags);
+#ifdef CONFIG_DEBUG_PAGEALLOC
if ...Following up the previous ifdef patch this moves the pool allocation/free
code into separate functions.
Minor drawback is that the pgd_lock is now taken more times (this
was needed to separate the with/without DEBUG_PAGEALLOC cases cleanly), but the
additional lock only hits in the DEBUG_PAGEALLOC case which is already slow and
shouldn't be a big issue. Advantage is easier readable code because split_large_page
gets smaller again and less ifdef jungle.
Other than taking the locks explicitely again for alloc and free
there should be no behaviour change.
Signed-off-by: Andi Kleen <ak@suse.de>
---
arch/x86/mm/pageattr.c | 84 +++++++++++++++++++++++++++++--------------------
1 file changed, 51 insertions(+), 33 deletions(-)
Index: linux/arch/x86/mm/pageattr.c
===================================================================
--- linux.orig/arch/x86/mm/pageattr.c
+++ linux/arch/x86/mm/pageattr.c
@@ -416,6 +416,42 @@ void __init cpa_init(void)
pool_pages, pool_size);
}
+static struct page *cpa_get_page(void)
+{
+ unsigned long flags;
+ struct page *page = NULL;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+
+ if (list_empty(&page_pool))
+ goto out;
+
+ page = list_first_entry(&page_pool, struct page, lru);
+ list_del(&page->lru);
+ pool_pages--;
+
+ if (pool_pages < pool_low)
+ pool_low = pool_pages;
+ pool_used++;
+
+out:
+ spin_unlock_irqrestore(&pgd_lock, flags);
+
+ return page;
+}
+
+static void cpa_free_page(struct page *page)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_add(&page->lru, &page_pool);
+ pool_pages++;
+ pool_used--;
+
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
#else
void __init cpa_init(void)
{
@@ -424,6 +460,16 @@ void __init cpa_init(void)
static inline void cpa_fill_pool(void)
{
}
+
+static struct page *cpa_get_page(void)
+{
+ return alloc_page(GFP_KERNEL);
+}
+
+static inline void cpa_free_page(struct page *page)
+{
+ __free_page(page);
+}
#endif
...New implementation does not use lru for anything so there is no need
to reject pages that are in the LRU. Similar for compound pages (which
were checked because they also use page->lru)
Signed-off-by: Andi Kleen <ak@suse.de>
---
arch/x86/mm/pageattr.c | 5 -----
1 file changed, 5 deletions(-)
Index: linux/arch/x86/mm/pageattr.c
===================================================================
--- linux.orig/arch/x86/mm/pageattr.c
+++ linux/arch/x86/mm/pageattr.c
@@ -545,7 +545,6 @@ static int __change_page_attr(unsigned l
{
int do_split, err;
unsigned int level;
- struct page *kpte_page;
pte_t *kpte;
repeat:
@@ -553,10 +552,6 @@ repeat:
if (!kpte)
return -EINVAL;
- kpte_page = virt_to_page(kpte);
- BUG_ON(PageLRU(kpte_page));
- BUG_ON(PageCompound(kpte_page));
-
if (level == PG_LEVEL_4K) {
pte_t new_pte, old_pte = *kpte;
pgprot_t new_prot = pte_pgprot(old_pte);
--
Applied. Removed the now unused variable as well. Thanks, tglx --
[2.6.25 candidate I believe] The specification of SS in the public manuals is a little unclear, but I got confirmation from Intel that SS implies that there is no cache flush needed on caching attribute changes. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86/mm/pageattr.c | 5 ++++- include/asm-x86/cpufeature.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) Index: linux/arch/x86/mm/pageattr.c =================================================================== --- linux.orig/arch/x86/mm/pageattr.c +++ linux/arch/x86/mm/pageattr.c @@ -74,7 +74,7 @@ static void __cpa_flush_all(void *arg) */ __flush_tlb_all(); - if (cache && boot_cpu_data.x86_model >= 4) + if (!cpu_has_ss && cache && boot_cpu_data.x86_model >= 4) wbinvd(); } @@ -108,6 +108,9 @@ static void cpa_flush_range(unsigned lon if (!cache) return; + if (cpu_has_ss) + return; + /* * We only need to flush on one CPU, * clflush is a MESI-coherent instruction that Index: linux/include/asm-x86/cpufeature.h =================================================================== --- linux.orig/include/asm-x86/cpufeature.h +++ linux/include/asm-x86/cpufeature.h @@ -157,6 +157,7 @@ extern const char * const x86_power_flag #define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR) #define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX) #define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) +#define cpu_has_ss boot_cpu_has(X86_FEATURE_SELFSNOOP) #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) #define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) #define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3) --
On Mon, 11 Feb 2008 10:50:22 +0100 (CET) I'm hearing slightly different information, but am still chasing this one. It might be better to delay this one to .26 ;-( (I would very much like to avoid the cache flush but... it really HAS to be safe) -- If you want to reach me at my work email, use arjan@linux.intel.com For development, discussion and tips for power savings, visit http://www.lesswatts.org --
10.12.8 in the ia32 manual volume 3a says: " When remapping a page that was previously mapped as a cacheable memory type to a WC page, an operating system can avoid this type of aliasing by doing the following: ... 3. Create a new mapping to the same physical address with a new memory type, for instance, WC. 4. Flush the caches on all processors that may have used the mapping previously. Note on processors that support self-snooping, CPUID feature flag bit 27, this ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ step is unnecessary. ^^^^^^^^^^^^^^^^^^^^ " That is exactly the situation in pageattr.c. You're saying the manual is wrong Unless there are known errata (i'm not aware of any) it is safe according to the manual. -Andi --
On Mon, 11 Feb 2008 16:12:56 +0100 I'm saying that we are not following step 2 (marking the pages not present) so 3 and 4 aren't all too relevant. As I said before I've asked internally about this and got mixed answers so I'm still chasing it down. -- If you want to reach me at my work email, use arjan@linux.intel.com For development, discussion and tips for power savings, visit http://www.lesswatts.org --
Yes that's true. It's one of the design problems of the intent API that makes fixing this hard unfortunately. (intent API assumes that the caller doesn't fully own the page to change, and Ok. -Andi --
BTW, making pages not present is required only while changing the attribute from WB to WC or WC to WB. I think this step is for avoiding attribute aliasing for speculative accesses. For UC, we don't have speculative accesses and such we probably don't need it. True. Perhaps for WC chages we can assume the ownership? thanks, suresh --
Ok that would imply that my patch is ok for all current in tree users at least (none of them ever set WC currently, only UC). It might not be safe for the out of tree WC users though. Not assuming owner ship was the main reason why the intent change was originally submitted. I personally never quite understood the rationale for that, but the submitter and the maintainers apparently thought it a valuable thing to have, otherwise the patch would not have been written and included. You'll need to ask them if that has changed now. At least if that requirement was dropped it would be possible to remove quite a lot of code from pageattr.c -Andi --
There is atleast one issue though. For an I/O device which is not capable of snooping the caches, if the driver model assumes that ioremap_nocache() will flush the cpu caches(before initiating DMA), then the lack of cache flush in cpa() might cause some breakages. I don't think we can drop that requirement for UC atleast. For WC, probably yes. thanks, suresh --
On Mon, 11 Feb 2008 11:47:12 -0800 this is a totally separate issue and I agree, it needs a separate API. another option is to do a three-step tango; go from WB to UC, flush, then go from UC to WB. We need to deal with this for correctness anyway (since this could happen naturally) so might as well do things that way. -- If you want to reach me at my work email, use arjan@linux.intel.com For development, discussion and tips for power savings, visit http://www.lesswatts.org --
I doubt we have any drivers with this problem because ioremap() for the longest time (until I added change_page_attr() to it the first time one a few releases ago) didn't flush its caches at all and it still doesn't flush the caches for the common case of the ioremapped area not being part of the direct mapping (standard case on 32bit and Please clarify what you mean. For what correctness aspect and in what circumstances would such a three step tango be needed? I don't see any mention of such a requirement in the manual. -Andi --
Add information about the mapping state of the direct mapping to /proc/meminfo.
This way we can see how many large pages are really used for it.
Signed-off-by: Andi Kleen <ak@suse.de>
---
arch/x86/mm/init_32.c | 2 ++
arch/x86/mm/init_64.c | 2 ++
arch/x86/mm/pageattr.c | 21 +++++++++++++++++++++
fs/proc/proc_misc.c | 7 +++++++
include/asm-x86/pgtable.h | 3 +++
5 files changed, 35 insertions(+)
Index: linux/arch/x86/mm/init_64.c
===================================================================
--- linux.orig/arch/x86/mm/init_64.c
+++ linux/arch/x86/mm/init_64.c
@@ -306,6 +306,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
if (pmd_val(*pmd))
continue;
+ dpages_cnt[PG_LEVEL_2M]++;
set_pte((pte_t *)pmd,
pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
}
@@ -351,6 +352,7 @@ phys_pud_init(pud_t *pud_page, unsigned
}
if (direct_gbpages) {
+ dpages_cnt[PG_LEVEL_1G]++;
set_pte((pte_t *)pud,
pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
true_end = (addr & PUD_MASK) + PUD_SIZE;
Index: linux/arch/x86/mm/pageattr.c
===================================================================
--- linux.orig/arch/x86/mm/pageattr.c
+++ linux/arch/x86/mm/pageattr.c
@@ -18,6 +18,8 @@
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
+unsigned long dpages_cnt[PG_LEVEL_NUM];
+
/*
* The current flushing context - we pass it instead of 5 arguments:
*/
@@ -516,6 +518,11 @@ static int split_large_page(pte_t *kpte,
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
+ if (address >= __va(0) && address < __va(end_pfn_map)) {
+ dpages_cnt[level]--;
+ dpages_cnt[level - 1] += PTRS_PER_PTE;
+ }
+
/*
* Install the new, split up pagetable. Important details here:
*
@@ -961,6 +968,22 @@ __initcall(debug_pagealloc_proc_init);
#endif
+#ifdef CONFIG_PROC_FS
+int arch_report_meminfo(char *page)
+{
+ int n;
+ n = sprintf(page, ...