Merge branch 'akpm-current/current'

[deliverable/linux.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 41940f6e3c1c07eec3467f41de4e5337e5d4b654..721d62c5be69977bc595f9cd8d8d9e5ce618ea26 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -260,7 +260,7 @@ int watermark_scale_factor = 10;
  
  static unsigned long __meminitdata nr_kernel_pages;
  static unsigned long __meminitdata nr_all_pages;
-static unsigned long __meminitdata dma_reserve;
+static unsigned long __meminitdata nr_memory_reserve;
  
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
@@ -613,6 +613,9 @@ static bool need_debug_guardpage(void)
         if (!debug_pagealloc_enabled())
                 return false;
  
+       if (!debug_guardpage_minorder())
+               return false;
+
         return true;
  }
  
@@ -621,6 +624,9 @@ static void init_debug_guardpage(void)
         if (!debug_pagealloc_enabled())
                 return;
  
+       if (!debug_guardpage_minorder())
+               return;
+
         _debug_guardpage_enabled = true;
  }
  
@@ -641,19 +647,22 @@ static int __init debug_guardpage_minorder_setup(char *buf)
         pr_info("Setting debug_guardpage_minorder to %lu\n", res);
         return 0;
  }
-__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
  
-static inline void set_page_guard(struct zone *zone, struct page *page,
+static inline bool set_page_guard(struct zone *zone, struct page *page,
                                 unsigned int order, int migratetype)
  {
         struct page_ext *page_ext;
  
         if (!debug_guardpage_enabled())
-               return;
+               return false;
+
+       if (order >= debug_guardpage_minorder())
+               return false;
  
         page_ext = lookup_page_ext(page);
         if (unlikely(!page_ext))
-               return;
+               return false;
  
         __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
  
@@ -661,6 +670,8 @@ static inline void set_page_guard(struct zone *zone, struct page *page,
         set_page_private(page, order);
         /* Guard pages are not available for any usage */
         __mod_zone_freepage_state(zone, -(1 << order), migratetype);
+
+       return true;
  }
  
  static inline void clear_page_guard(struct zone *zone, struct page *page,
@@ -682,9 +693,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
                 __mod_zone_freepage_state(zone, (1 << order), migratetype);
  }
  #else
-struct page_ext_operations debug_guardpage_ops = { NULL, };
-static inline void set_page_guard(struct zone *zone, struct page *page,
-                               unsigned int order, int migratetype) {}
+struct page_ext_operations debug_guardpage_ops;
+static inline bool set_page_guard(struct zone *zone, struct page *page,
+                       unsigned int order, int migratetype) { return false; }
  static inline void clear_page_guard(struct zone *zone, struct page *page,
                                 unsigned int order, int migratetype) {}
  #endif
@@ -1419,15 +1430,18 @@ static void __init deferred_free_range(struct page *page,
                 return;
  
         /* Free a large naturally-aligned chunk if possible */
-       if (nr_pages == MAX_ORDER_NR_PAGES &&
-           (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
+       if (nr_pages == pageblock_nr_pages &&
+           (pfn & (pageblock_nr_pages - 1)) == 0) {
                 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-               __free_pages_boot_core(page, MAX_ORDER-1);
+               __free_pages_boot_core(page, pageblock_order);
                 return;
         }
  
-       for (i = 0; i < nr_pages; i++, page++)
+       for (i = 0; i < nr_pages; i++, page++, pfn++) {
+               if ((pfn & (pageblock_nr_pages - 1)) == 0)
+                       set_pageblock_migratetype(page, MIGRATE_MOVABLE);
                 __free_pages_boot_core(page, 0);
+       }
  }
  
  /* Completion tracking for deferred_init_memmap() threads */
@@ -1495,9 +1509,9 @@ static int __init deferred_init_memmap(void *data)
  
                         /*
                          * Ensure pfn_valid is checked every
-                        * MAX_ORDER_NR_PAGES for memory holes
+                        * pageblock_nr_pages for memory holes
                          */
-                       if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
+                       if ((pfn & (pageblock_nr_pages - 1)) == 0) {
                                 if (!pfn_valid(pfn)) {
                                         page = NULL;
                                         goto free_range;
@@ -1510,7 +1524,7 @@ static int __init deferred_init_memmap(void *data)
                         }
  
                         /* Minimise pfn page lookups and scheduler checks */
-                       if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) {
+                       if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
                                 page++;
                         } else {
                                 nr_pages += nr_to_free;
@@ -1546,6 +1560,9 @@ free_range:
                         free_base_page = NULL;
                         free_base_pfn = nr_to_free = 0;
                 }
+               /* Free the last block of pages to the allocator */
+               nr_pages += nr_to_free;
+               deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
  
                 first_init_pfn = max(end_pfn, first_init_pfn);
         }
@@ -1642,18 +1659,15 @@ static inline void expand(struct zone *zone, struct page *page,
                 size >>= 1;
                 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
  
-               if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
-                       debug_guardpage_enabled() &&
-                       high < debug_guardpage_minorder()) {
-                       /*
-                        * Mark as guard pages (or page), that will allow to
-                        * merge back to allocator when buddy will be freed.
-                        * Corresponding page table entries will not be touched,
-                        * pages will stay not present in virtual address space
-                        */
-                       set_page_guard(zone, &page[size], high, migratetype);
+               /*
+                * Mark these pages as guard pages (or a guard page) so that
+                * they can merge back into the allocator when the buddy is
+                * freed. The corresponding page table entries are not touched,
+                * so the pages stay not present in the virtual address space.
+                */
+               if (set_page_guard(zone, &page[size], high, migratetype))
                         continue;
-               }
+
                 list_add(&page[size].lru, &area->free_list[migratetype]);
                 area->nr_free++;
                 set_page_order(&page[size], high);
@@ -2515,9 +2529,14 @@ int __isolate_free_page(struct page *page, unsigned int order)
         mt = get_pageblock_migratetype(page);
  
         if (!is_migrate_isolate(mt)) {
-               /* Obey watermarks as if the page was being allocated */
-               watermark = low_wmark_pages(zone) + (1 << order);
-               if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+               /*
+                * Obey watermarks as if the page was being allocated. We can
+                * emulate a high-order watermark check with a raised order-0
+                * watermark, because we already know our high-order page
+                * exists.
+                */
+               watermark = min_wmark_pages(zone) + (1UL << order);
+               if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
                         return 0;
  
                 __mod_zone_freepage_state(zone, -(1UL << order), mt);
@@ -3163,6 +3182,61 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         return NULL;
  }
  
+static inline bool
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+                    enum compact_result compact_result,
+                    enum compact_priority *compact_priority,
+                    int compaction_retries)
+{
+       int max_retries = MAX_COMPACT_RETRIES;
+       int min_priority;
+
+       if (!order)
+               return false;
+
+       /*
+        * Compaction considers every zone to be desperately out of memory,
+        * so retrying makes little sense unless the failure could have been
+        * caused by insufficient priority.
+        */
+       if (compaction_failed(compact_result))
+               goto check_priority;
+
+       /*
+        * Make sure compaction wasn't deferred and didn't bail out early
+        * due to lock contention before we declare that we should give up.
+        * But do not retry if the given zonelist is not suitable for
+        * compaction.
+        */
+       if (compaction_withdrawn(compact_result))
+               return compaction_zonelist_suitable(ac, order, alloc_flags);
+
+       /*
+        * !costly requests are much more important than __GFP_REPEAT
+        * costly ones because they are de facto nofail and invoke the OOM
+        * killer to move on, while costly ones can fail and their users are
+        * ready to cope with that. Using 1/4 of the retries is rather
+        * arbitrary, but we would need much more detailed feedback from
+        * compaction to make a better decision.
+        */
+       if (order > PAGE_ALLOC_COSTLY_ORDER)
+               max_retries /= 4;
+       if (compaction_retries <= max_retries)
+               return true;
+
+       /*
+        * Make sure there is at least one attempt at the highest priority
+        * if we exhausted all retries at the lower priorities.
+        */
+check_priority:
+       min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+                       MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
+       if (*compact_priority > min_priority) {
+               (*compact_priority)--;
+               return true;
+       }
+       return false;
+}
  #else
  static inline struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
@@ -3173,8 +3247,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         return NULL;
  }
  
-#endif /* CONFIG_COMPACTION */
-
  static inline bool
  should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
                      enum compact_result compact_result,
@@ -3201,6 +3273,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
         }
         return false;
  }
+#endif /* CONFIG_COMPACTION */
  
  /* Perform direct synchronous page reclaim */
  static int
@@ -4581,7 +4654,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
         int j;
         struct zonelist *zonelist;
  
-       zonelist = &pgdat->node_zonelists[0];
+       zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
         for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
                 ;
         j = build_zonelists_node(NODE_DATA(node), zonelist, j);
@@ -4597,7 +4670,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
         int j;
         struct zonelist *zonelist;
  
-       zonelist = &pgdat->node_zonelists[1];
+       zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
         j = build_zonelists_node(pgdat, zonelist, 0);
         zonelist->_zonerefs[j].zone = NULL;
         zonelist->_zonerefs[j].zone_idx = 0;
@@ -4618,7 +4691,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
         struct zone *z;
         struct zonelist *zonelist;
  
-       zonelist = &pgdat->node_zonelists[0];
+       zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
         pos = 0;
         for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
                 for (j = 0; j < nr_nodes; j++) {
@@ -4753,7 +4826,7 @@ static void build_zonelists(pg_data_t *pgdat)
  
         local_node = pgdat->node_id;
  
-       zonelist = &pgdat->node_zonelists[0];
+       zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
         j = build_zonelists_node(pgdat, zonelist, 0);
  
         /*
@@ -5025,15 +5098,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                         break;
  
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-               /*
-                * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
-                * from zone_movable_pfn[nid] to end of each node should be
-                * ZONE_MOVABLE not ZONE_NORMAL. skip it.
-                */
-               if (!mirrored_kernelcore && zone_movable_pfn[nid])
-                       if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
-                               continue;
-
                 /*
                  * Check given memblock attribute by firmware which can affect
                  * kernel memory layout.  If zone==ZONE_MOVABLE but memory is
@@ -5477,6 +5541,12 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
                         *zone_end_pfn = min(node_end_pfn,
                                 arch_zone_highest_possible_pfn[movable_zone]);
  
+               /* Adjust for ZONE_MOVABLE starting within this range */
+               } else if (!mirrored_kernelcore &&
+                       *zone_start_pfn < zone_movable_pfn[nid] &&
+                       *zone_end_pfn > zone_movable_pfn[nid]) {
+                       *zone_end_pfn = zone_movable_pfn[nid];
+
                 /* Check if this whole range is within ZONE_MOVABLE */
                 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
                         *zone_start_pfn = *zone_end_pfn;
@@ -5580,28 +5650,23 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
          * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
          * and vice versa.
          */
-       if (zone_movable_pfn[nid]) {
-               if (mirrored_kernelcore) {
-                       unsigned long start_pfn, end_pfn;
-                       struct memblock_region *r;
-
-                       for_each_memblock(memory, r) {
-                               start_pfn = clamp(memblock_region_memory_base_pfn(r),
-                                                 zone_start_pfn, zone_end_pfn);
-                               end_pfn = clamp(memblock_region_memory_end_pfn(r),
-                                               zone_start_pfn, zone_end_pfn);
-
-                               if (zone_type == ZONE_MOVABLE &&
-                                   memblock_is_mirror(r))
-                                       nr_absent += end_pfn - start_pfn;
-
-                               if (zone_type == ZONE_NORMAL &&
-                                   !memblock_is_mirror(r))
-                                       nr_absent += end_pfn - start_pfn;
-                       }
-               } else {
-                       if (zone_type == ZONE_NORMAL)
-                               nr_absent += node_end_pfn - zone_movable_pfn[nid];
+       if (mirrored_kernelcore && zone_movable_pfn[nid]) {
+               unsigned long start_pfn, end_pfn;
+               struct memblock_region *r;
+
+               for_each_memblock(memory, r) {
+                       start_pfn = clamp(memblock_region_memory_base_pfn(r),
+                                         zone_start_pfn, zone_end_pfn);
+                       end_pfn = clamp(memblock_region_memory_end_pfn(r),
+                                       zone_start_pfn, zone_end_pfn);
+
+                       if (zone_type == ZONE_MOVABLE &&
+                           memblock_is_mirror(r))
+                               nr_absent += end_pfn - start_pfn;
+
+                       if (zone_type == ZONE_NORMAL &&
+                           !memblock_is_mirror(r))
+                               nr_absent += end_pfn - start_pfn;
                 }
         }
  
@@ -5838,10 +5903,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
                 }
  
                 /* Account for reserved pages */
-               if (j == 0 && freesize > dma_reserve) {
-                       freesize -= dma_reserve;
+               if (j == 0 && freesize > nr_memory_reserve) {
+                       freesize -= nr_memory_reserve;
                         printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
-                                       zone_names[0], dma_reserve);
+                                       zone_names[0], nr_memory_reserve);
                 }
  
                 if (!is_highmem_idx(j))
@@ -6527,8 +6592,9 @@ void __init mem_init_print_info(const char *str)
  }
  
  /**
- * set_dma_reserve - set the specified number of pages reserved in the first zone
- * @new_dma_reserve: The number of pages to mark reserved
+ * set_memory_reserve - set number of pages reserved in the first zone
+ * @nr_reserve: The number of pages to mark reserved
+ * @inc: if true, add @nr_reserve to the existing value; if false, replace it.
   *
   * The per-cpu batchsize and zone watermarks are determined by managed_pages.
   * In the DMA zone, a significant percentage may be consumed by kernel image
@@ -6537,9 +6603,12 @@ void __init mem_init_print_info(const char *str)
   * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
   * smaller per-cpu batchsize.
   */
-void __init set_dma_reserve(unsigned long new_dma_reserve)
+void __init set_memory_reserve(unsigned long nr_reserve, bool inc)
  {
-       dma_reserve = new_dma_reserve;
+       if (inc)
+               nr_memory_reserve += nr_reserve;
+       else
+               nr_memory_reserve = nr_reserve;
  }
  
  void __init free_area_init(unsigned long *zones_size)
@@ -6955,6 +7024,17 @@ static int __init set_hashdist(char *str)
  __setup("hashdist=", set_hashdist);
  #endif
  
+#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
+/*
+ * Default when __HAVE_ARCH_RESERVED_KERNEL_PAGES is undefined: the arch
+ * reserves no pages that alloc_large_system_hash() does not know about.
+ */
+static unsigned long __init arch_reserved_kernel_pages(void)
+{
+       return 0;
+}
+#endif
+
  /*
   * allocate a large system hash table from bootmem
   * - it is assumed that the hash table must contain an exact power-of-2
@@ -6979,6 +7059,7 @@ void *__init alloc_large_system_hash(const char *tablename,
         if (!numentries) {
                 /* round applicable memory size up to nearest megabyte */
                 numentries = nr_kernel_pages;
+               numentries -= arch_reserved_kernel_pages();
  
                 /* It isn't necessary when PAGE_SIZE >= 1MB */
                 if (PAGE_SHIFT < 20)