mm: remove offlining arg to migrate_pages
[deliverable/linux.git] / mm / memory_hotplug.c
index 6c90d222ec0ac4529598f7aae3b2d2e26eaad289..dda1ca695a08be1bda136e811b4d64fca24a6683 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/suspend.h>
 #include <linux/mm_inline.h>
 #include <linux/firmware-map.h>
+#include <linux/stop_machine.h>
 
 #include <asm/tlbflush.h>
 
@@ -123,6 +124,7 @@ void __ref put_page_bootmem(struct page *page)
                mutex_lock(&ppb_lock);
                __free_pages_bootmem(page, 0);
                mutex_unlock(&ppb_lock);
+               totalram_pages++;
        }
 
 }
@@ -432,20 +434,211 @@ static int __meminit __add_section(int nid, struct zone *zone,
        return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static int __remove_section(struct zone *zone, struct mem_section *ms)
+/*
+ * Find the smallest valid pfn in the range [start_pfn, end_pfn).
+ *
+ * The range is walked one memory section at a time, starting at @start_pfn.
+ * A pfn qualifies when its section is valid, it belongs to node @nid and,
+ * if @zone is non-NULL, its page belongs to @zone.
+ *
+ * Returns the first qualifying pfn, or 0 if there is none.  The return type
+ * is unsigned long, like the pfn itself, so that a large pfn is not
+ * truncated on its way back to the caller.
+ */
+static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
+                                              unsigned long start_pfn,
+                                              unsigned long end_pfn)
+{
+       struct mem_section *ms;
+
+       for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
+               ms = __pfn_to_section(start_pfn);
+
+               if (unlikely(!valid_section(ms)))
+                       continue;
+
+               if (unlikely(pfn_to_nid(start_pfn) != nid))
+                       continue;
+
+               if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+                       continue;
+
+               return start_pfn;
+       }
+
+       return 0;
+}
+
+/*
+ * Find the biggest valid pfn in the range [start_pfn, end_pfn).
+ *
+ * The range is walked backwards one memory section at a time, starting at
+ * end_pfn - 1.  A pfn qualifies when its section is valid, it belongs to
+ * node @nid and, if @zone is non-NULL, its page belongs to @zone.
+ *
+ * Returns the first qualifying pfn, or 0 if there is none.  The return type
+ * is unsigned long, like the pfn itself, so that a large pfn is not
+ * truncated on its way back to the caller.
+ */
+static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
+                                             unsigned long start_pfn,
+                                             unsigned long end_pfn)
+{
+       struct mem_section *ms;
+       unsigned long pfn;
+
+       /*
+        * Start from the last pfn of the range.  pfn is unsigned, so stepping
+        * below zero wraps to a huge value that would still satisfy
+        * "pfn >= start_pfn"; the "pfn < end_pfn" test ends the walk then.
+        */
+       pfn = end_pfn - 1;
+       for (; pfn >= start_pfn && pfn < end_pfn; pfn -= PAGES_PER_SECTION) {
+               ms = __pfn_to_section(pfn);
+
+               if (unlikely(!valid_section(ms)))
+                       continue;
+
+               if (unlikely(pfn_to_nid(pfn) != nid))
+                       continue;
+
+               if (zone && zone != page_zone(pfn_to_page(pfn)))
+                       continue;
+
+               return pfn;
+       }
+
+       return 0;
+}
+
+/*
+ * Shrink @zone's span after the memory section [start_pfn, end_pfn) is
+ * removed.  zone->zone_start_pfn and zone->spanned_pages are updated under
+ * the zone span write lock; both are reset to 0 when no other valid section
+ * of the zone is found.
+ */
+static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
+                            unsigned long end_pfn)
 {
+       unsigned long zone_start_pfn =  zone->zone_start_pfn;
+       unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+       unsigned long pfn;
+       struct mem_section *ms;
+       int nid = zone_to_nid(zone);
+
+       zone_span_writelock(zone);
+       if (zone_start_pfn == start_pfn) {
+               /*
+                * The section is the smallest one in the zone, so both
+                * zone->zone_start_pfn and zone->spanned_pages must shrink.
+                * Look for the next smallest valid mem_section and make it
+                * the new start of the zone.
+                */
+               pfn = find_smallest_section_pfn(nid, zone, end_pfn,
+                                               zone_end_pfn);
+               if (pfn) {
+                       zone->zone_start_pfn = pfn;
+                       zone->spanned_pages = zone_end_pfn - pfn;
+               }
+       } else if (zone_end_pfn == end_pfn) {
+               /*
+                * The section is the biggest one in the zone, so
+                * zone->spanned_pages must shrink.
+                * Look for the next biggest valid mem_section and make it
+                * the new end of the zone.
+                */
+               pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+                                              start_pfn);
+               if (pfn)
+                       zone->spanned_pages = pfn - zone_start_pfn + 1;
+       }
+
        /*
-        * XXX: Freeing memmap with vmemmap is not implement yet.
-        *      This should be removed later.
+        * If the section is neither the biggest nor the smallest mem_section
+        * in the zone, removing it only creates a hole and the span need not
+        * change.  The zone may now consist of holes only, though, so check
+        * whether any other valid section of the zone remains.
         */
-       return -EBUSY;
+       pfn = zone_start_pfn;
+       for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
+               ms = __pfn_to_section(pfn);
+
+               if (unlikely(!valid_section(ms)))
+                       continue;
+
+               if (page_zone(pfn_to_page(pfn)) != zone)
+                       continue;
+
+               /* Skip the section that is being removed */
+               if (start_pfn == pfn)
+                       continue;
+
+               /* Another valid section remains: nothing more to do */
+               zone_span_writeunlock(zone);
+               return;
+       }
+
+       /* The zone has no valid section */
+       zone->zone_start_pfn = 0;
+       zone->spanned_pages = 0;
+       zone_span_writeunlock(zone);
 }
-#else
-static int __remove_section(struct zone *zone, struct mem_section *ms)
+
+/*
+ * Shrink @pgdat's span after the memory section [start_pfn, end_pfn) is
+ * removed.  pgdat->node_start_pfn and pgdat->node_spanned_pages are updated;
+ * both are reset to 0 when no other valid section of the node is found.
+ * No lock is taken here.
+ */
+static void shrink_pgdat_span(struct pglist_data *pgdat,
+                             unsigned long start_pfn, unsigned long end_pfn)
+{
+       unsigned long pgdat_start_pfn =  pgdat->node_start_pfn;
+       unsigned long pgdat_end_pfn =
+               pgdat->node_start_pfn + pgdat->node_spanned_pages;
+       unsigned long pfn;
+       struct mem_section *ms;
+       int nid = pgdat->node_id;
+
+       if (pgdat_start_pfn == start_pfn) {
+               /*
+                * The section is the smallest one in the pgdat, so both
+                * pgdat->node_start_pfn and pgdat->node_spanned_pages must
+                * shrink.  Look for the next smallest valid mem_section and
+                * make it the new start of the pgdat.
+                */
+               pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
+                                               pgdat_end_pfn);
+               if (pfn) {
+                       pgdat->node_start_pfn = pfn;
+                       pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
+               }
+       } else if (pgdat_end_pfn == end_pfn) {
+               /*
+                * The section is the biggest one in the pgdat, so
+                * pgdat->node_spanned_pages must shrink.
+                * Look for the next biggest valid mem_section and make it
+                * the new end of the pgdat.
+                */
+               pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
+                                              start_pfn);
+               if (pfn)
+                       pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
+       }
+
+       /*
+        * If the section is neither the biggest nor the smallest mem_section
+        * in the pgdat, removing it only creates a hole and the span need
+        * not change.
+        * The pgdat may now consist of holes only, though, so check whether
+        * any other valid section of the node remains.
+        */
+       pfn = pgdat_start_pfn;
+       for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
+               ms = __pfn_to_section(pfn);
+
+               if (unlikely(!valid_section(ms)))
+                       continue;
+
+               if (pfn_to_nid(pfn) != nid)
+                       continue;
+
+               /* Skip the section that is being removed */
+               if (start_pfn == pfn)
+                       continue;
+
+               /* Another valid section remains: nothing more to do */
+               return;
+       }
+
+       /* The pgdat has no valid section */
+       pgdat->node_start_pfn = 0;
+       pgdat->node_spanned_pages = 0;
+}
+
+/*
+ * Shrink the spans of @zone and of its pgdat for the memory section that
+ * starts at @start_pfn and covers PAGES_PER_SECTION pages.  Both spans are
+ * updated while the pgdat resize lock is held.
+ */
+static void __remove_zone(struct zone *zone, unsigned long start_pfn)
 {
-       unsigned long flags;
        struct pglist_data *pgdat = zone->zone_pgdat;
+       int nr_pages = PAGES_PER_SECTION;
+       unsigned long flags;
+
+       pgdat_resize_lock(zone->zone_pgdat, &flags);
+       shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
+       shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+       pgdat_resize_unlock(zone->zone_pgdat, &flags);
+}
+
+static int __remove_section(struct zone *zone, struct mem_section *ms)
+{
+       unsigned long start_pfn;
+       int scn_nr;
        int ret = -EINVAL;
 
        if (!valid_section(ms))
@@ -455,12 +648,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
        if (ret)
                return ret;
 
-       pgdat_resize_lock(pgdat, &flags);
+       scn_nr = __section_nr(ms);
+       start_pfn = section_nr_to_pfn(scn_nr);
+       __remove_zone(zone, start_pfn);
+
        sparse_remove_one_section(zone, ms);
-       pgdat_resize_unlock(pgdat, &flags);
        return 0;
 }
-#endif
 
 /*
  * Reasonably generic function for adding memory.  It is
@@ -824,11 +1018,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
        unsigned long zholes_size[MAX_NR_ZONES] = {0};
        unsigned long start_pfn = start >> PAGE_SHIFT;
 
-       pgdat = arch_alloc_nodedata(nid);
-       if (!pgdat)
-               return NULL;
+       pgdat = NODE_DATA(nid);
+       if (!pgdat) {
+               pgdat = arch_alloc_nodedata(nid);
+               if (!pgdat)
+                       return NULL;
 
-       arch_refresh_nodedata(nid, pgdat);
+               arch_refresh_nodedata(nid, pgdat);
+       }
 
        /* we can use NODE_DATA(nid) from here */
 
@@ -881,7 +1078,8 @@ out:
 int __ref add_memory(int nid, u64 start, u64 size)
 {
        pg_data_t *pgdat = NULL;
-       int new_pgdat = 0;
+       bool new_pgdat;
+       bool new_node;
        struct resource *res;
        int ret;
 
@@ -892,12 +1090,16 @@ int __ref add_memory(int nid, u64 start, u64 size)
        if (!res)
                goto out;
 
-       if (!node_online(nid)) {
+       {       /* Stupid hack to suppress address-never-null warning */
+               void *p = NODE_DATA(nid);
+               new_pgdat = !p;
+       }
+       new_node = !node_online(nid);
+       if (new_node) {
                pgdat = hotadd_new_pgdat(nid, start);
                ret = -ENOMEM;
                if (!pgdat)
                        goto error;
-               new_pgdat = 1;
        }
 
        /* call arch's memory hotadd */
@@ -909,7 +1111,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
        /* we online node here. we can't roll back from here. */
        node_set_online(nid);
 
-       if (new_pgdat) {
+       if (new_node) {
                ret = register_one_node(nid);
                /*
                 * If sysfs file of new node can't create, cpu on the node
@@ -1084,8 +1286,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                 * migrate_pages returns # of failed pages.
                 */
                ret = migrate_pages(&source, alloc_migrate_target, 0,
-                                                       true, MIGRATE_SYNC,
-                                                       MR_MEMORY_HOTPLUG);
+                                       MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
                if (ret)
                        putback_lru_pages(&source);
        }
@@ -1487,7 +1688,111 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
        return ret;
 }
 
-int __ref remove_memory(u64 start, u64 size)
+/*
+ * Check whether any present cpu is still mapped to the node described by
+ * @data, which is used as a struct pglist_data pointer.
+ *
+ * Returns -EBUSY if such a cpu exists, 0 otherwise.
+ */
+static int check_cpu_on_node(void *data)
+{
+       struct pglist_data *pgdat = data;
+       int cpu;
+
+       for_each_present_cpu(cpu) {
+               if (cpu_to_node(cpu) == pgdat->node_id)
+                       /*
+                        * A cpu on this node has not been removed yet, so
+                        * the node cannot be offlined.
+                        */
+                       return -EBUSY;
+       }
+
+       return 0;
+}
+
+/*
+ * Call numa_clear_node() for every possible cpu whose cpu_to_node() is the
+ * node described by @data, which is used as a struct pglist_data pointer.
+ * The body is compiled only when CONFIG_ACPI_NUMA is set; otherwise this
+ * function does nothing.
+ */
+static void unmap_cpu_on_node(void *data)
+{
+#ifdef CONFIG_ACPI_NUMA
+       struct pglist_data *pgdat = data;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               if (cpu_to_node(cpu) == pgdat->node_id)
+                       numa_clear_node(cpu);
+#endif
+}
+
+/*
+ * Run check_cpu_on_node() on @data and, only if it succeeds, follow up with
+ * unmap_cpu_on_node().
+ *
+ * Returns the error from check_cpu_on_node(), or 0 once the cpus have been
+ * unmapped.
+ */
+static int check_and_unmap_cpu_on_node(void *data)
+{
+       int ret = check_cpu_on_node(data);
+
+       if (ret)
+               return ret;
+
+       /*
+        * The node is going to be offlined once we get here, so the
+        * cpu_to_node() mapping can be cleared now.
+        */
+
+       unmap_cpu_on_node(data);
+       return 0;
+}
+
+/*
+ * Offline the node if all memory sections of this node are removed.
+ *
+ * Nothing is done while a present section inside the node's span still
+ * belongs to @nid, or while check_and_unmap_cpu_on_node(), run through
+ * stop_machine(), reports a failure.  Otherwise the node is marked offline
+ * and unregistered.  If the pgdat page is a slab or compound page, the zone
+ * wait tables are freed and the pgdat is zeroed as well.
+ */
+void try_offline_node(int nid)
+{
+       pg_data_t *pgdat = NODE_DATA(nid);
+       unsigned long start_pfn = pgdat->node_start_pfn;
+       unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
+       unsigned long pfn;
+       struct page *pgdat_page = virt_to_page(pgdat);
+       int i;
+
+       for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+               unsigned long section_nr = pfn_to_section_nr(pfn);
+
+               if (!present_section_nr(section_nr))
+                       continue;
+
+               if (pfn_to_nid(pfn) != nid)
+                       continue;
+
+               /*
+                * Some memory sections of this node have not been removed,
+                * so the node cannot be offlined now.
+                */
+               return;
+       }
+
+       if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
+               return;
+
+       /*
+        * All memory and cpus of this node have been removed, so the node
+        * can be offlined now.
+        */
+       node_set_offline(nid);
+       unregister_one_node(nid);
+
+       if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
+               /* node data is allocated from boot memory */
+               return;
+
+       /* free the wait table of each zone */
+       for (i = 0; i < MAX_NR_ZONES; i++) {
+               struct zone *zone = pgdat->node_zones + i;
+
+               if (zone->wait_table)
+                       vfree(zone->wait_table);
+       }
+
+       /*
+        * Since there is no way to guarantee that the address of pgdat/zone
+        * is not on the stack of any kernel thread or used by other kernel
+        * objects without reference counting or another synchronizing
+        * method, do not reset node_data and free pgdat here. Just reset it
+        * to 0 and reuse the memory when the node is online again.
+        */
+       memset(pgdat, 0, sizeof(*pgdat));
+}
+EXPORT_SYMBOL(try_offline_node);
+
+int __ref remove_memory(int nid, u64 start, u64 size)
 {
        unsigned long start_pfn, end_pfn;
        int ret = 0;
@@ -1542,6 +1847,8 @@ repeat:
 
        arch_remove_memory(start, size);
 
+       try_offline_node(nid);
+
        unlock_memory_hotplug();
 
        return 0;
@@ -1551,7 +1858,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
        return -EINVAL;
 }
-int remove_memory(u64 start, u64 size)
+int remove_memory(int nid, u64 start, u64 size)
 {
        return -EINVAL;
 }
This page took 0.027984 seconds and 5 git commands to generate.