[PATCH] Make high and batch sizes of per_cpu_pagelists configurable
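Two changes are folded together in this patch. First, the page-state accounting is reworked: counters are updated through non-atomic __-prefixed helpers inside sections that already run with interrupts disabled, saving an irq save/restore pair on the allocation and freeing fast paths. Second, a new percpu_pagelist_fraction sysctl lets the high watermark (and derived batch size) of each zone's hot per-cpu pagelist be set at runtime to present_pages / fraction.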
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 512e3f4d4963ecf4e255ab74f7fdc1aa1b5ff494..2c46f697e8ff805a8bc2bdf8d1229ff101b059bf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly;
 unsigned long totalram_pages __read_mostly;
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
+int percpu_pagelist_fraction;
 
 static void fastcall free_hot_cold_page(struct page *page, int cold);
 
@@ -424,9 +425,9 @@ void __free_pages_ok(struct page *page, unsigned int order)
                return;
 
        list_add(&page->lru, &list);
-       mod_page_state(pgfree, 1 << order);
        kernel_map_pages(page, 1<<order, 0);
        local_irq_save(flags);
+       __mod_page_state(pgfree, 1 << order);
        free_pages_bulk(page_zone(page), 1, &list, order);
        local_irq_restore(flags);
 }
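
Moving the pgfree accounting below local_irq_save() is what allows the switch from mod_page_state() to the cheaper __mod_page_state() form. The convention, sketched below in kernel style (counter_add()/__counter_add() are illustrative names, not the actual page_alloc.c code), is the usual one: the __-prefixed variant relies on the caller having disabled interrupts, while the plain variant is self-contained.

	static inline void __counter_add(unsigned long *ctr, unsigned long delta)
	{
		*ctr += delta;			/* caller holds irqs off */
	}

	static inline void counter_add(unsigned long *ctr, unsigned long delta)
	{
		unsigned long flags;

		local_irq_save(flags);		/* protect the read-modify-write */
		__counter_add(ctr, delta);
		local_irq_restore(flags);
	}
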
@@ -674,18 +675,14 @@ void drain_local_pages(void)
 }
 #endif /* CONFIG_PM */
 
-static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
 {
 #ifdef CONFIG_NUMA
-       unsigned long flags;
-       int cpu;
        pg_data_t *pg = z->zone_pgdat;
        pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
        struct per_cpu_pageset *p;
 
-       local_irq_save(flags);
-       cpu = smp_processor_id();
-       p = zone_pcp(z,cpu);
+       p = zone_pcp(z, cpu);
        if (pg == orig) {
                p->numa_hit++;
        } else {
@@ -696,7 +693,6 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
                p->local_node++;
        else
                p->other_node++;
-       local_irq_restore(flags);
 #endif
 }
 
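zone_statistics() used to do its own local_irq_save() and smp_processor_id(); now it takes the cpu from its caller and assumes interrupts are already off, so the NUMA hit/miss counters are updated for free inside the allocator's existing irq-disabled window (see the buffered_rmqueue() hunk below).
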
@@ -716,11 +712,11 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
        if (free_pages_check(page))
                return;
 
-       inc_page_state(pgfree);
        kernel_map_pages(page, 1, 0);
 
        pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
        local_irq_save(flags);
+       __inc_page_state(pgfree);
        list_add(&page->lru, &pcp->list);
        pcp->count++;
        if (pcp->count >= pcp->high)
@@ -753,49 +749,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-static struct page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
+static struct page *buffered_rmqueue(struct zonelist *zonelist,
+                       struct zone *zone, int order, gfp_t gfp_flags)
 {
        unsigned long flags;
        struct page *page;
        int cold = !!(gfp_flags & __GFP_COLD);
+       int cpu;
 
 again:
+       cpu  = get_cpu();
        if (order == 0) {
                struct per_cpu_pages *pcp;
 
-               page = NULL;
-               pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+               pcp = &zone_pcp(zone, cpu)->pcp[cold];
                local_irq_save(flags);
-               if (!pcp->count)
+               if (!pcp->count) {
                        pcp->count += rmqueue_bulk(zone, 0,
                                                pcp->batch, &pcp->list);
-               if (likely(pcp->count)) {
-                       page = list_entry(pcp->list.next, struct page, lru);
-                       list_del(&page->lru);
-                       pcp->count--;
+                       if (unlikely(!pcp->count))
+                               goto failed;
                }
-               local_irq_restore(flags);
-               put_cpu();
+               page = list_entry(pcp->list.next, struct page, lru);
+               list_del(&page->lru);
+               pcp->count--;
        } else {
                spin_lock_irqsave(&zone->lock, flags);
                page = __rmqueue(zone, order);
-               spin_unlock_irqrestore(&zone->lock, flags);
+               spin_unlock(&zone->lock);
+               if (!page)
+                       goto failed;
        }
 
-       if (page != NULL) {
-               BUG_ON(bad_range(zone, page));
-               mod_page_state_zone(zone, pgalloc, 1 << order);
-               if (prep_new_page(page, order))
-                       goto again;
+       __mod_page_state_zone(zone, pgalloc, 1 << order);
+       zone_statistics(zonelist, zone, cpu);
+       local_irq_restore(flags);
+       put_cpu();
 
-               if (gfp_flags & __GFP_ZERO)
-                       prep_zero_page(page, order, gfp_flags);
+       BUG_ON(bad_range(zone, page));
+       if (prep_new_page(page, order))
+               goto again;
 
-               if (order && (gfp_flags & __GFP_COMP))
-                       prep_compound_page(page, order);
-       }
+       if (gfp_flags & __GFP_ZERO)
+               prep_zero_page(page, order, gfp_flags);
+
+       if (order && (gfp_flags & __GFP_COMP))
+               prep_compound_page(page, order);
        return page;
+
+failed:
+       local_irq_restore(flags);
+       put_cpu();
+       return NULL;
 }
 
 #define ALLOC_NO_WATERMARKS    0x01 /* don't check watermarks at all */
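
The restructured buffered_rmqueue() opens a single get_cpu()/local_irq_save() window that covers the per-cpu list refill, the pgalloc accounting and zone_statistics(), and both the order-0 and order>0 paths share one failed: unwind label instead of duplicating the restore sequence. A distilled skeleton of the new control flow (take_page() and account_alloc() are placeholders standing in for the pcp / __rmqueue() and statistics code, not real kernel functions):

	again:
		cpu = get_cpu();
		local_irq_save(flags);
		page = take_page(zone, order);	/* pcp list, or __rmqueue() */
		if (!page)
			goto failed;		/* single unwind for both paths */
		account_alloc(zone, order, cpu);/* __mod_page_state_zone() +
						   zone_statistics() */
		local_irq_restore(flags);
		put_cpu();
		/* prep_new_page() may still send us back to "again" */
		return page;

	failed:
		local_irq_restore(flags);
		put_cpu();
		return NULL;
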
@@ -871,9 +876,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
                                continue;
                }
 
-               page = buffered_rmqueue(*z, order, gfp_mask);
+               page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
                if (page) {
-                       zone_statistics(zonelist, *z);
                        break;
                }
        } while (*(++z) != NULL);
@@ -1201,6 +1205,7 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
        int cpu = 0;
 
        memset(ret, 0, sizeof(*ret));
+       cpus_and(*cpumask, *cpumask, cpu_online_map);
 
        cpu = first_cpu(*cpumask);
        while (cpu < NR_CPUS) {
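
Masking the caller's cpumask with cpu_online_map keeps __get_page_state() from reading the possibly stale per-cpu page_states of offline CPUs; read_page_state_offset() below gets the same treatment via for_each_online_cpu().
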
@@ -1248,12 +1253,12 @@ void get_full_page_state(struct page_state *ret)
        __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
 }
 
-unsigned long __read_page_state(unsigned long offset)
+unsigned long read_page_state_offset(unsigned long offset)
 {
        unsigned long ret = 0;
        int cpu;
 
-       for_each_cpu(cpu) {
+       for_each_online_cpu(cpu) {
                unsigned long in;
 
                in = (unsigned long)&per_cpu(page_states, cpu) + offset;
@@ -1262,18 +1267,26 @@ unsigned long __read_page_state(unsigned long offset)
        return ret;
 }
 
-void __mod_page_state(unsigned long offset, unsigned long delta)
+void __mod_page_state_offset(unsigned long offset, unsigned long delta)
+{
+       void *ptr;
+
+       ptr = &__get_cpu_var(page_states);
+       *(unsigned long *)(ptr + offset) += delta;
+}
+EXPORT_SYMBOL(__mod_page_state_offset);
+
+void mod_page_state_offset(unsigned long offset, unsigned long delta)
 {
        unsigned long flags;
-       void* ptr;
+       void *ptr;
 
        local_irq_save(flags);
        ptr = &__get_cpu_var(page_states);
-       *(unsigned long*)(ptr + offset) += delta;
+       *(unsigned long *)(ptr + offset) += delta;
        local_irq_restore(flags);
 }
-
-EXPORT_SYMBOL(__mod_page_state);
+EXPORT_SYMBOL(mod_page_state_offset);
 
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
                        unsigned long *free, struct pglist_data *pgdat)
@@ -1460,21 +1473,25 @@ void show_free_areas(void)
  * Add all populated zones of a node to the zonelist.
  */
 static int __init build_zonelists_node(pg_data_t *pgdat,
-                       struct zonelist *zonelist, int j, int k)
+                       struct zonelist *zonelist, int nr_zones, int zone_type)
 {
        struct zone *zone;
 
-       BUG_ON(k > ZONE_HIGHMEM);
-       for (zone = pgdat->node_zones + k; zone >= pgdat->node_zones; zone--) {
+       BUG_ON(zone_type > ZONE_HIGHMEM);
+
+       do {
+               zone = pgdat->node_zones + zone_type;
                if (populated_zone(zone)) {
 #ifndef CONFIG_HIGHMEM
-                       BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL);
+                       BUG_ON(zone_type > ZONE_NORMAL);
 #endif
-                       zonelist->zones[j++] = zone;
-                       check_highest_zone(k);
+                       zonelist->zones[nr_zones++] = zone;
+                       check_highest_zone(zone_type);
                }
-       }
-       return j;
+               zone_type--;
+
+       } while (zone_type >= 0);
+       return nr_zones;
 }
 
 static inline int highest_zone(int zone_bits)
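
Behaviour here is unchanged: the function still walks from the requested zone downwards, appending each populated zone to the zonelist; the pointer arithmetic is just replaced by an index-based do/while and the opaque j/k parameters get descriptive names. A standalone illustration of the new loop shape (the zone numbering and populated[] contents are invented for the demo):

	#include <stdio.h>

	enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM };

	int main(void)
	{
		int populated[] = { 1, 1, 0 };	/* pretend HIGHMEM is empty */
		int zone_type = ZONE_HIGHMEM;	/* start at the requested zone */
		int nr_zones = 0;

		do {
			if (populated[zone_type])
				printf("zonelist[%d] = zone %d\n",
					nr_zones++, zone_type);
			zone_type--;
		} while (zone_type >= 0);	/* fall back towards ZONE_DMA */

		return 0;
	}
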
@@ -1815,6 +1832,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
        INIT_LIST_HEAD(&pcp->list);
 }
 
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+                               unsigned long high)
+{
+       struct per_cpu_pages *pcp;
+
+       pcp = &p->pcp[0]; /* hot list */
+       pcp->high = high;
+       pcp->batch = max(1UL, high/4);
+       if ((high/4) > (PAGE_SHIFT * 8))
+               pcp->batch = PAGE_SHIFT * 8;
+}
+
+
 #ifdef CONFIG_NUMA
 /*
  * Boot pageset table. One per cpu which is going to be used for all
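
The batch size is derived as a quarter of the high watermark, floored at one page and capped at PAGE_SHIFT * 8 pages (96 on a 4KB-page system). The arithmetic is easy to check in isolation; the zone size and fraction below are example inputs, not values from the patch:

	#include <stdio.h>

	int main(void)
	{
		unsigned long page_shift = 12;		/* 4KB pages: PAGE_SHIFT == 12 */
		unsigned long zone_pages = 1UL << 20;	/* a 4GB zone of 4KB pages */
		unsigned long fraction = 8;		/* percpu_pagelist_fraction */

		unsigned long high = zone_pages / fraction;	/* 131072 pages */
		unsigned long batch = high / 4 > 0 ? high / 4 : 1;
		if (high / 4 > page_shift * 8)
			batch = page_shift * 8;		/* capped at 96 */

		printf("pcp->high = %lu, pcp->batch = %lu\n", high, batch);
		return 0;
	}
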
@@ -1852,6 +1887,10 @@ static int __devinit process_zones(int cpu)
                        goto bad;
 
                setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+
+               if (percpu_pagelist_fraction)
+                       setup_pagelist_highmark(zone_pcp(zone, cpu),
+                               (zone->present_pages / percpu_pagelist_fraction));
        }
 
        return 0;
@@ -1867,7 +1906,6 @@ bad:
 
 static inline void free_zone_pagesets(int cpu)
 {
-#ifdef CONFIG_NUMA
        struct zone *zone;
 
        for_each_zone(zone) {
@@ -1876,7 +1914,6 @@ static inline void free_zone_pagesets(int cpu)
                zone_pcp(zone, cpu) = NULL;
                kfree(pset);
        }
-#endif
 }
 
 static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
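
The #ifdef CONFIG_NUMA/#endif pair inside free_zone_pagesets() was redundant: the function sits within the larger #ifdef CONFIG_NUMA block opened above the boot pageset table, so it is only compiled on NUMA builds anyway.
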
@@ -2553,6 +2590,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
        return 0;
 }
 
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu. It is the fraction of total pages in each zone that a hot per-cpu
+ * pagelist can have before it gets flushed back to the buddy allocator.
+ */
+
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+       struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+       struct zone *zone;
+       unsigned int cpu;
+       int ret;
+
+       ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+       if (!write || (ret == -EINVAL))
+               return ret;
+       for_each_zone(zone) {
+               for_each_online_cpu(cpu) {
+                       unsigned long  high;
+                       high = zone->present_pages / percpu_pagelist_fraction;
+                       setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+               }
+       }
+       return 0;
+}
+
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
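
Note that the handler divides by the freshly written value, so the ctl_table entry's minmax bounds must keep percpu_pagelist_fraction strictly positive. That entry lives in kernel/sysctl.c and is not part of this hunk; it would look roughly like the sketch below (field values are assumptions, not copied from the patch). Once wired up, writing e.g. 8 to /proc/sys/vm/percpu_pagelist_fraction caps each hot list at one eighth of its zone.

	/* Rough shape of the companion kernel/sysctl.c entry (assumed, not
	 * shown in this diff); min_percpu_pagelist_fract supplies the lower
	 * bound enforced by proc_dointvec_minmax(). */
	{
		.ctl_name	= VM_PERCPU_PAGELIST_FRACTION,
		.procname	= "percpu_pagelist_fraction",
		.data		= &percpu_pagelist_fraction,
		.maxlen		= sizeof(percpu_pagelist_fraction),
		.mode		= 0644,
		.proc_handler	= &percpu_pagelist_fraction_sysctl_handler,
		.strategy	= &sysctl_intvec,
		.extra1		= &min_percpu_pagelist_fract,
	},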