logo       

memory hotremove prototype, take 3: msg#00000

Subject: memory hotremove prototype, take 3
Hi,

this is a new version of my memory hotplug prototype patch, against
linux-2.6.0-test11.

Freeing 100% of a specified memory zone is non-trivial and necessary
for memory hot removal.  This patch splits memory into 1GB zones, and
implements complete zone memory freeing using kswapd or "remapping".

A bit more detailed explanation and some test scripts are at:
        http://people.valinux.co.jp/~iwamoto/mh.html

Main changes against previous versions are:
- The stability is greatly improved.  Kernel crashes (probably related
  to kswapd) still happen, but they are rare enough that I have
  difficulty reproducing them.
  Page remapping under simultaneous tar + rm -rf works.
- Implemented a solution to a deadlock caused by ext2_rename, which
  increments a refcount of a directory page twice.

Questions and comments are welcome.

$Id: memoryhotplug.patch,v 1.26 2003/11/28 09:12:12 iwamoto Exp $

diff -dpur linux-2.6.0-test11/arch/i386/Kconfig 
linux-2.6.0-test11-mh/arch/i386/Kconfig
--- linux-2.6.0-test11/arch/i386/Kconfig        Thu Nov 27 05:43:07 2003
+++ linux-2.6.0-test11-mh/arch/i386/Kconfig     Fri Nov 28 17:45:42 2003
@@ -706,14 +706,18 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
 comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI"
        depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY)
 
+config MEMHOTPLUGTEST
+       bool "Memory hotplug test"
+       default n
+
 config DISCONTIGMEM
        bool
-       depends on NUMA
+       depends on NUMA || MEMHOTPLUGTEST
        default y
 
 config HAVE_ARCH_BOOTMEM_NODE
        bool
-       depends on NUMA
+       depends on NUMA || MEMHOTPLUGTEST
        default y
 
 config HIGHPTE
diff -dpur linux-2.6.0-test11/arch/i386/mm/discontig.c 
linux-2.6.0-test11-mh/arch/i386/mm/discontig.c
--- linux-2.6.0-test11/arch/i386/mm/discontig.c Thu Nov 27 05:44:20 2003
+++ linux-2.6.0-test11-mh/arch/i386/mm/discontig.c      Fri Nov 28 17:45:42 2003
@@ -28,6 +28,7 @@
 #include <linux/mmzone.h>
 #include <linux/highmem.h>
 #include <linux/initrd.h>
+#include <linux/proc_fs.h>
 #include <asm/e820.h>
 #include <asm/setup.h>
 #include <asm/mmzone.h>
@@ -111,6 +112,49 @@ int __init get_memcfg_numa_flat(void)
        return 1;
 }
 
+/* Test layout: carve memory into 1GB blocks, one pseudo-node per block. */
+int __init get_memcfg_numa_blks(void)
+{
+       int i, pfn;
+
+       printk("NUMA - single node, flat memory mode, but broken in several blocks\n");
+
+       /* Run the memory configuration and find the top of memory. */
+       find_max_pfn();
+       if (max_pfn & (PTRS_PER_PTE - 1)) {
+               pfn = max_pfn & ~(PTRS_PER_PTE - 1);
+               printk("Rounding down maxpfn %d -> %d\n", max_pfn, pfn);
+               max_pfn = pfn;
+       }
+       for (i = 0; i < MAX_NUMNODES; i++) {
+               pfn = PFN_DOWN(1 << 30) * i;
+               node_start_pfn[i]  = pfn;
+               pfn += PFN_DOWN(1 << 30);
+               if (pfn < max_pfn)
+                       node_end_pfn[i]   = pfn;
+               else {
+                       node_end_pfn[i]   = max_pfn;
+                       i++;
+                       printk("total %d blocks, max %d\n", i, max_pfn);
+                       break;
+               }
+       }
+
+       /*
+        * Fill in physnode_map: each element maps to the 1GB block
+        * (node index) containing its first pfn.
+        */
+       for (pfn = node_start_pfn[0]; pfn <= max_pfn; pfn += PAGES_PER_ELEMENT) {
+               physnode_map[pfn / PAGES_PER_ELEMENT] = pfn / PFN_DOWN(1 << 30);
+       }
+
+       /* Only node 0 is marked online here; numnodes is the block count. */
+       node_set_online(0);
+       numnodes = i;
+
+       return 1;
+}
+
 /*
  * Find the highest page frame number we have available for the node
  */
@@ -183,6 +227,8 @@ static void __init register_bootmem_low_
        }
 }
 
+static struct kcore_list numa_kc;
+
 void __init remap_numa_kva(void)
 {
        void *vaddr;
@@ -196,7 +242,11 @@ void __init remap_numa_kva(void)
                                node_remap_start_pfn[node] + pfn, 
                                PAGE_KERNEL_LARGE);
                }
+               memset(node_remap_start_vaddr[node], 0,
+                   node_remap_size[node] * PAGE_SIZE);
        }
+       kclist_add(&numa_kc, node_remap_start_vaddr[numnodes - 1],
+           node_remap_offset[numnodes - 1] << PAGE_SHIFT);
 }
 
 static unsigned long calculate_numa_remap_pages(void)
diff -dpur linux-2.6.0-test11/include/asm-i386/kmap_types.h 
linux-2.6.0-test11-mh/include/asm-i386/kmap_types.h
--- linux-2.6.0-test11/include/asm-i386/kmap_types.h    Thu Nov 27 05:44:56 2003
+++ linux-2.6.0-test11-mh/include/asm-i386/kmap_types.h Fri Nov 28 17:52:08 2003
@@ -24,7 +24,13 @@ D(10)        KM_IRQ0,
 D(11)  KM_IRQ1,
 D(12)  KM_SOFTIRQ0,
 D(13)  KM_SOFTIRQ1,
+#ifdef CONFIG_MEMHOTPLUGTEST
+D(14)  KM_REMAP0,
+D(15)  KM_REMAP1,
+D(16)  KM_TYPE_NR,
+#else
 D(14)  KM_TYPE_NR
+#endif
 };
 
 #undef D
diff -dpur linux-2.6.0-test11/include/asm-i386/mmzone.h 
linux-2.6.0-test11-mh/include/asm-i386/mmzone.h
--- linux-2.6.0-test11/include/asm-i386/mmzone.h        Thu Nov 27 05:44:10 2003
+++ linux-2.6.0-test11-mh/include/asm-i386/mmzone.h     Fri Nov 28 17:45:42 2003
@@ -128,6 +128,10 @@ static inline struct pglist_data *pfn_to
 #endif /* CONFIG_X86_NUMAQ */
 
 extern int get_memcfg_numa_flat(void );
+#ifdef CONFIG_MEMHOTPLUGTEST
+extern int get_memcfg_numa_blks(void);
+#endif
+
 /*
  * This allows any one NUMA architecture to be compiled
  * for, and still fall back to the flat function if it
@@ -143,6 +147,10 @@ static inline void get_memcfg_numa(void)
                return;
 #endif
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+       get_memcfg_numa_blks();
+       return;
+#endif
        get_memcfg_numa_flat();
 }
 
diff -dpur linux-2.6.0-test11/include/asm-i386/numnodes.h 
linux-2.6.0-test11-mh/include/asm-i386/numnodes.h
--- linux-2.6.0-test11/include/asm-i386/numnodes.h      Thu Nov 27 05:43:09 2003
+++ linux-2.6.0-test11-mh/include/asm-i386/numnodes.h   Fri Nov 28 17:45:42 2003
@@ -13,6 +13,10 @@
 /* Max 8 Nodes */
 #define NODES_SHIFT    3
 
+#elif defined(CONFIG_MEMHOTPLUGTEST)
+
+#define NODES_SHIFT    3
+
 #endif /* CONFIG_X86_NUMAQ */
 
 #endif /* _ASM_MAX_NUMNODES_H */
diff -dpur linux-2.6.0-test11/include/linux/mm.h 
linux-2.6.0-test11-mh/include/linux/mm.h
--- linux-2.6.0-test11/include/linux/mm.h       Thu Nov 27 05:42:55 2003
+++ linux-2.6.0-test11-mh/include/linux/mm.h    Fri Nov 28 17:45:42 2003
@@ -219,7 +219,14 @@ struct page {
  */
 #define put_page_testzero(p)                           \
        ({                                              \
-               BUG_ON(page_count(p) == 0);             \
+               if (page_count(p) == 0) {               \
+                       int __i;        /* must not shadow names used in p */ \
+                       printk("Page: %lx ", (unsigned long)(p));       \
+                       for (__i = 0; __i < sizeof(struct page); __i++) \
+                               printk(" %02x", ((unsigned char *)(p))[__i]); \
+                       printk("\n");                                   \
+                       BUG();                          \
+               }                                       \
                atomic_dec_and_test(&(p)->count);       \
        })
 
diff -dpur linux-2.6.0-test11/include/linux/mmzone.h 
linux-2.6.0-test11-mh/include/linux/mmzone.h
--- linux-2.6.0-test11/include/linux/mmzone.h   Thu Nov 27 05:44:20 2003
+++ linux-2.6.0-test11-mh/include/linux/mmzone.h        Fri Nov 28 17:45:42 2003
@@ -360,6 +360,10 @@ static inline unsigned int num_online_me
        return num;
 }
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+int zone_activep(const struct zone *);
+int remapd(void *p);
+#endif
 #else /* !CONFIG_DISCONTIGMEM && !CONFIG_NUMA */
 
 #define node_online(node) \
diff -dpur linux-2.6.0-test11/include/linux/page-flags.h 
linux-2.6.0-test11-mh/include/linux/page-flags.h
--- linux-2.6.0-test11/include/linux/page-flags.h       Thu Nov 27 05:44:52 2003
+++ linux-2.6.0-test11-mh/include/linux/page-flags.h    Fri Nov 28 17:45:42 2003
@@ -76,6 +76,8 @@
 #define PG_reclaim             18      /* To be reclaimed asap */
 #define PG_compound            19      /* Part of a compound page */
 
+#define        PG_again                20
+
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -268,6 +270,10 @@ extern void get_full_page_state(struct p
 #define PageCompound(page)     test_bit(PG_compound, &(page)->flags)
 #define SetPageCompound(page)  set_bit(PG_compound, &(page)->flags)
 #define ClearPageCompound(page)        clear_bit(PG_compound, &(page)->flags)
+
+#define PageAgain(page)        test_bit(PG_again, &(page)->flags)
+#define SetPageAgain(page)     set_bit(PG_again, &(page)->flags)
+#define ClearPageAgain(page)   clear_bit(PG_again, &(page)->flags)
 
 /*
  * The PageSwapCache predicate doesn't use a PG_flag at this time,
diff -dpur linux-2.6.0-test11/mm/filemap.c linux-2.6.0-test11-mh/mm/filemap.c
--- linux-2.6.0-test11/mm/filemap.c     Thu Nov 27 05:43:33 2003
+++ linux-2.6.0-test11-mh/mm/filemap.c  Fri Nov 28 17:45:42 2003
@@ -448,7 +448,8 @@ repeat:
                        spin_lock(&mapping->page_lock);
 
                        /* Has the page been truncated while we slept? */
-                       if (page->mapping != mapping || page->index != offset) {
+                       if (page->mapping != mapping || page->index != offset ||
+                           PageAgain(page)) {
                                unlock_page(page);
                                page_cache_release(page);
                                goto repeat;
@@ -677,6 +678,12 @@ page_not_up_to_date:
                        goto page_ok;
                }
 
+               if (PageAgain(page)) {
+                       unlock_page(page);
+                       page_cache_release(page);
+                       goto find_page;
+               }
+
 readpage:
                /* ... and start the actual read. The read will unlock the page. */
                error = mapping->a_ops->readpage(filp, page);
@@ -1120,6 +1127,12 @@ page_not_uptodate:
                goto success;
        }
 
+       if (PageAgain(page)) {
+               unlock_page(page);
+               page_cache_release(page);
+               goto retry_find;
+       }
+
        if (!mapping->a_ops->readpage(file, page)) {
                wait_on_page_locked(page);
                if (PageUptodate(page))
@@ -1228,6 +1241,12 @@ page_not_uptodate:
                goto success;
        }
 
+       if (PageAgain(page)) {
+               unlock_page(page);
+               page_cache_release(page);
+               goto retry_find;
+       }
+
        if (!mapping->a_ops->readpage(file, page)) {
                wait_on_page_locked(page);
                if (PageUptodate(page))
@@ -1436,6 +1455,11 @@ retry:
        if (PageUptodate(page)) {
                unlock_page(page);
                goto out;
+       }
+       if (PageAgain(page)) {
+               unlock_page(page);
+               page_cache_release(page);
+               goto retry;
        }
        err = filler(data, page);
        if (err < 0) {
diff -dpur linux-2.6.0-test11/mm/page_alloc.c 
linux-2.6.0-test11-mh/mm/page_alloc.c
--- linux-2.6.0-test11/mm/page_alloc.c  Thu Nov 27 05:42:56 2003
+++ linux-2.6.0-test11-mh/mm/page_alloc.c       Fri Nov 28 17:45:42 2003
@@ -31,6 +31,7 @@
 #include <linux/topology.h>
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
+#include <linux/proc_fs.h>
 
 #include <asm/tlbflush.h>
 
@@ -52,6 +53,9 @@ EXPORT_SYMBOL(nr_swap_pages);
  */
 struct zone *zone_table[MAX_NR_ZONES*MAX_NUMNODES];
 EXPORT_SYMBOL(zone_table);
+#ifdef CONFIG_MEMHOTPLUGTEST
+static char zone_active[MAX_NR_ZONES*MAX_NUMNODES];
+#endif
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
@@ -411,7 +415,9 @@ int is_head_of_free_region(struct page *
        spin_unlock_irqrestore(&zone->lock, flags);
         return 0;
 }
+#endif
 
+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_MEMHOTPLUGTEST)
 /*
  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
  */
@@ -512,9 +518,28 @@ static struct page *buffered_rmqueue(str
                mod_page_state(pgalloc, 1 << order);
                prep_new_page(page, order);
        }
+#ifdef CONFIG_MEMHOTPLUGTEST
+       if (page != NULL && ! zone_active[page->flags >> ZONE_SHIFT])
+               printk("alloc_page from disabled zone: %p\n", page);
+#endif
        return page;
 }
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+/* zone_active[] flag of z, BUG() if absent; scan bounded by the table size */
+int zone_activep(const struct zone *z)
+{
+       int i;
+
+       for (i = 0; i < MAX_NR_ZONES*MAX_NUMNODES && zone_table[i] != z; i++)
+               if (zone_table[i] == NULL)
+                       BUG();
+       if (i == MAX_NR_ZONES*MAX_NUMNODES)
+               BUG();
+       return zone_active[i];
+}
+#endif
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  *
@@ -562,6 +587,10 @@ __alloc_pages(unsigned int gfp_mask, uns
                struct zone *z = zones[i];
                unsigned long local_low;
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+               if (! zone_activep(z))
+                       continue;
+#endif
                /*
                 * This is the fabled 'incremental min'. We let real-time tasks
                 * dip their real-time paws a little deeper into reserves.
@@ -590,6 +619,10 @@ __alloc_pages(unsigned int gfp_mask, uns
                unsigned long local_min;
                struct zone *z = zones[i];
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+               if (! zone_activep(z))
+                       continue;
+#endif
                local_min = z->pages_min;
                if (gfp_mask & __GFP_HIGH)
                        local_min >>= 2;
@@ -613,6 +646,10 @@ rebalance:
                for (i = 0; zones[i] != NULL; i++) {
                        struct zone *z = zones[i];
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+                       if (! zone_activep(z))
+                               continue;
+#endif
                        page = buffered_rmqueue(z, order, cold);
                        if (page)
                                goto got_pg;
@@ -638,6 +675,10 @@ rebalance:
        for (i = 0; zones[i] != NULL; i++) {
                struct zone *z = zones[i];
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+               if (! zone_activep(z))
+                       continue;
+#endif
                min += z->pages_min;
                if (z->free_pages >= min ||
                                (!wait && z->free_pages >= z->pages_high)) {
@@ -1076,6 +1117,9 @@ static int __init build_zonelists_node(p
 static void __init build_zonelists(pg_data_t *pgdat)
 {
        int i, j, k, node, local_node;
+#ifdef CONFIG_MEMHOTPLUGTEST
+       struct zone *zone;
+#endif
 
        local_node = pgdat->node_id;
        printk("Building zonelist for node : %d\n", local_node);
@@ -1091,7 +1135,7 @@ static void __init build_zonelists(pg_da
                        k = ZONE_HIGHMEM;
                if (i & __GFP_DMA)
                        k = ZONE_DMA;
-
+#ifndef CONFIG_MEMHOTPLUGTEST
                j = build_zonelists_node(pgdat, zonelist, j, k);
                /*
                 * Now we build the zonelist so that it contains the zones
@@ -1107,6 +1151,23 @@ static void __init build_zonelists(pg_da
                        j = build_zonelists_node(NODE_DATA(node), zonelist, j, 
k);
  
                zonelist->zones[j++] = NULL;
+#else
+               for(; k >= 0; k--) {
+                       zone = pgdat->node_zones + k;
+                       if (zone->present_pages)
+                               zonelist->zones[j++] = zone;
+                       for (node = local_node + 1; node < numnodes; node++) {
+                               zone = NODE_DATA(node)->node_zones + k;
+                               if (zone->present_pages)
+                                       zonelist->zones[j++] = zone;
+                       }
+                       for (node = 0; node < local_node; node++) {
+                               zone = NODE_DATA(node)->node_zones + k;
+                               if (zone->present_pages)
+                                       zonelist->zones[j++] = zone;
+                       }
+               }
+#endif
        } 
 }
 
@@ -1252,6 +1313,9 @@ static void __init free_area_init_core(s
                unsigned long batch;
 
                zone_table[nid * MAX_NR_ZONES + j] = zone;
+#ifdef CONFIG_MEMHOTPLUGTEST
+               zone_active[nid * MAX_NR_ZONES + j] = 1;
+#endif
                realsize = size = zones_size[j];
                if (zholes_size)
                        realsize -= zholes_size[j];
@@ -1644,3 +1708,145 @@ int min_free_kbytes_sysctl_handler(ctl_t
        setup_per_zone_pages_min();
        return 0;
 }
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+/* read_proc handler: one status line per populated zone in zone_table[]. */
+static int mhtest_read(char *page, char **start, off_t off, int count,
+    int *eof, void *data)
+{
+       char *p = page;
+       int i, len;
+       const struct zone *z;
+
+       /* Bound the scan: a full zone_table[] has no NULL terminator. */
+       for (i = 0; i < MAX_NR_ZONES*MAX_NUMNODES; i++) {
+               z = zone_table[i];
+               if (z == NULL)
+                       break;
+               if (!z->present_pages)
+                       continue;       /* skip empty zone */
+               p += sprintf(p, "Zone %d: %sabled free %d, active %d, present %d\n", i,
+                   zone_active[i] ? "en" : "dis", z->free_pages, z->nr_active,
+                   z->present_pages);
+       }
+       len = p - page;
+
+       /* Clamp the reply to [off, off + count); flag EOF if it covers the tail. */
+       if (len <= off + count)
+               *eof = 1;
+       *start = page + off;
+       len -= off;
+       if (len < 0)
+               len = 0;
+       if (len > count)
+               len = count;
+
+       return len;
+}
+
+/*
+ * write_proc handler: parses "<command> <zone_table index>" and runs disable,
+ * purge, enable, remap, active or inuse on that zone.  Returns the clamped
+ * count even for bad input; -EFAULT only if the user copy fails.
+ */
+static int mhtest_write(struct file *file, const char *buffer,
+    unsigned long count, void *data)
+{
+       unsigned long idx;
+       char buf[64], *p;
+       int i;
+       struct list_head *l;
+
+       if (count > sizeof(buf) - 1)
+               count = sizeof(buf) - 1;
+       if (copy_from_user(buf, buffer, count))
+               return -EFAULT;
+       buf[count] = 0;
+
+       p = strchr(buf, ' ');
+       if (p == NULL)
+               goto out;
+
+       *p++ = '\0';
+       idx = simple_strtoul(p, NULL, 0);
+
+       /* Reject indexes past the table and empty slots before any dereference. */
+       if (idx >= MAX_NR_ZONES*MAX_NUMNODES || zone_table[idx] == NULL) {
+               printk("Argument out of range\n");
+               goto out;
+       }
+       if (strcmp(buf, "disable") == 0) {
+               printk("disable %lu\n", idx);
+               /* XXX */
+               for (i = 0; i < NR_CPUS; i++) {
+                       struct per_cpu_pages *pcp;
+
+                       pcp = &zone_table[idx]->pageset[i].pcp[0];      /* hot */
+                       pcp->low = pcp->high = 0;
+
+                       pcp = &zone_table[idx]->pageset[i].pcp[1];      /* cold */
+                       pcp->low = pcp->high = 0;
+               }
+               zone_active[idx] = 0;
+               zone_table[idx]->pages_high = zone_table[idx]->present_pages;
+       } else if (strcmp(buf, "purge") == 0) {
+               if (zone_active[idx])
+                       printk("Zone %lu still active (proceeding anyway)\n",
+                           idx);
+               printk("purge %lu\n", idx);
+               wake_up_interruptible(&zone_table[idx]->zone_pgdat->kswapd_wait);
+               /* XXX overkill, but who cares? */
+               on_each_cpu(drain_local_pages, NULL, 1, 1);
+       } else if (strcmp(buf, "enable") == 0) {
+               printk("enable %lu\n", idx);
+               zone_active[idx] = 1;
+               zone_table[idx]->pages_high = zone_table[idx]->pages_min * 3;
+               /* XXX */
+               for (i = 0; i < NR_CPUS; i++) {
+                       struct per_cpu_pages *pcp;
+
+                       pcp = &zone_table[idx]->pageset[i].pcp[0];      /* hot */
+                       pcp->low = 2 * pcp->batch;
+                       pcp->high = 6 * pcp->batch;
+
+                       pcp = &zone_table[idx]->pageset[i].pcp[1];      /* cold */
+                       pcp->high = 2 * pcp->batch;
+               }
+       } else if (strcmp(buf, "remap") == 0) {
+               on_each_cpu(drain_local_pages, NULL, 1, 1);
+               kernel_thread(remapd, zone_table[idx], CLONE_KERNEL);
+       } else if (strcmp(buf, "active") == 0) {
+               spin_lock_irq(&zone_table[idx]->lru_lock);
+               i = 0;
+               list_for_each(l, &zone_table[idx]->active_list) {
+                       printk(" %lx", (unsigned long)list_entry(l, struct page, lru));
+                       i++;
+                       if (i == 10)
+                               break;
+               }
+               spin_unlock_irq(&zone_table[idx]->lru_lock);
+               printk("\n");
+       } else if (strcmp(buf, "inuse") == 0) {
+               for (i = 0; i < zone_table[idx]->spanned_pages; i++)
+                       if (page_count(&zone_table[idx]->zone_mem_map[i]))
+                               printk(" %lx", (unsigned long)&zone_table[idx]->zone_mem_map[i]);
+               printk("\n");
+       }
+out:
+       return count;
+}
+
+/* Create /proc/memhotplug and attach mhtest_read/mhtest_write to it. */
+static int __init procmhtest_init(void)
+{
+       struct proc_dir_entry *entry;
+
+       entry = create_proc_entry("memhotplug", 0, NULL);
+       if (entry == NULL)
+               return -ENOMEM; /* was -1, which reads as -EPERM */
+       entry->read_proc = &mhtest_read;
+       entry->write_proc = &mhtest_write;
+       return 0;
+}
+__initcall(procmhtest_init);
+#endif
diff -dpur linux-2.6.0-test11/mm/shmem.c linux-2.6.0-test11-mh/mm/shmem.c
--- linux-2.6.0-test11/mm/shmem.c       Thu Nov 27 05:43:41 2003
+++ linux-2.6.0-test11-mh/mm/shmem.c    Fri Nov 28 17:45:42 2003
@@ -80,7 +80,15 @@ static inline struct page *shmem_dir_all
         * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
         * might be reconsidered if it ever diverges from PAGE_SIZE.
         */
-       return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#ifdef CONFIG_MEMHOTPLUGTEST
+       struct page* p = alloc_pages(gfp_mask & ~__GFP_HIGHMEM,
+           PAGE_CACHE_SHIFT-PAGE_SHIFT);
+       printk("shmem_dir_alloc: %lx\n", (unsigned long)p);
+       return p;
+#else
+       return alloc_pages(gfp_mask & ~__GFP_HIGHMEM,
+           PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#endif
 }
 
 static inline void shmem_dir_free(struct page *page)
diff -dpur linux-2.6.0-test11/mm/truncate.c linux-2.6.0-test11-mh/mm/truncate.c
--- linux-2.6.0-test11/mm/truncate.c    Thu Nov 27 05:45:39 2003
+++ linux-2.6.0-test11-mh/mm/truncate.c Fri Nov 28 17:45:42 2003
@@ -132,6 +132,10 @@ void truncate_inode_pages(struct address
                        next++;
                        if (TestSetPageLocked(page))
                                continue;
+                       if (PageAgain(page)) {
+                               unlock_page(page);
+                               continue;
+                       }
                        if (PageWriteback(page)) {
                                unlock_page(page);
                                continue;
@@ -165,6 +169,14 @@ void truncate_inode_pages(struct address
                        struct page *page = pvec.pages[i];
 
                        lock_page(page);
+                       if (PageAgain(page)) {
+                               unsigned long index = page->index;
+
+                               unlock_page(page);
+                               put_page(page);
+                               page = find_lock_page(mapping, index);
+                               pvec.pages[i] = page;
+                       }
                        wait_on_page_writeback(page);
                        if (page->index > next)
                                next = page->index;
@@ -255,6 +267,14 @@ void invalidate_inode_pages2(struct addr
                        struct page *page = pvec.pages[i];
 
                        lock_page(page);
+                       if (PageAgain(page)) {
+                               unsigned long index = page->index;
+
+                               unlock_page(page);
+                               put_page(page);
+                               page = find_lock_page(mapping, index);
+                               pvec.pages[i] = page;
+                       }
                        if (page->mapping == mapping) { /* truncate race? */
                                wait_on_page_writeback(page);
                                next = page->index + 1;
diff -dpur linux-2.6.0-test11/mm/vmalloc.c linux-2.6.0-test11-mh/mm/vmalloc.c
--- linux-2.6.0-test11/mm/vmalloc.c     Thu Nov 27 05:44:23 2003
+++ linux-2.6.0-test11-mh/mm/vmalloc.c  Fri Nov 28 17:45:42 2003
@@ -447,7 +447,11 @@ EXPORT_SYMBOL(__vmalloc);
  */
 void *vmalloc(unsigned long size)
 {
+#ifdef CONFIG_MEMHOTPLUGTEST
+       return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); /* test config: no __GFP_HIGHMEM */
+#else
        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+#endif
 }
 
 EXPORT_SYMBOL(vmalloc);
diff -dpur linux-2.6.0-test11/mm/vmscan.c linux-2.6.0-test11-mh/mm/vmscan.c
--- linux-2.6.0-test11/mm/vmscan.c      Thu Nov 27 05:43:06 2003
+++ linux-2.6.0-test11-mh/mm/vmscan.c   Fri Nov 28 17:55:35 2003
@@ -36,6 +36,9 @@
 #include <asm/div64.h>
 
 #include <linux/swapops.h>
+#ifdef CONFIG_KDB
+#include <linux/kdb.h>
+#endif
 
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
@@ -285,6 +288,8 @@ shrink_list(struct list_head *page_list,
                        goto keep_locked;
 
                pte_chain_lock(page);
+               if ((! zone_activep(page_zone(page))) && page_mapped(page))
+                       page_referenced(page);
                referenced = page_referenced(page);
                if (referenced && page_mapping_inuse(page)) {
                        /* In active use or really unfreeable.  Activate it. */
@@ -589,7 +594,7 @@ done:
  * But we had to alter page->flags anyway.
  */
 static void
-refill_inactive_zone(struct zone *zone, const int nr_pages_in,
+refill_inactive_zone(struct zone *zone, int nr_pages_in,
                        struct page_state *ps, int priority)
 {
        int pgmoved;
@@ -607,6 +612,12 @@ refill_inactive_zone(struct zone *zone, 
 
        lru_add_drain();
        pgmoved = 0;
+#ifdef CONFIG_MEMHOTPLUGTEST
+       if (! zone_activep(zone)) {
+               nr_pages = nr_pages_in = zone->present_pages - zone->free_pages;
+               printk("Purging active list of disabled zone\n");
+       }
+#endif
        spin_lock_irq(&zone->lru_lock);
        while (nr_pages && !list_empty(&zone->active_list)) {
                page = list_entry(zone->active_list.prev, struct page, lru);
@@ -658,12 +669,20 @@ refill_inactive_zone(struct zone *zone, 
         */
        if (swap_tendency >= 100)
                reclaim_mapped = 1;
+#ifdef CONFIG_MEMHOTPLUGTEST
+       if (! zone_activep(zone))
+               reclaim_mapped = 1;
+#endif
 
        while (!list_empty(&l_hold)) {
                page = list_entry(l_hold.prev, struct page, lru);
                list_del(&page->lru);
                if (page_mapped(page)) {
                        pte_chain_lock(page);
+#ifdef CONFIG_MEMHOTPLUGTEST
+                       if (! zone_activep(zone))
+                               page_referenced(page);  /* XXX */
+#endif
                        if (page_mapped(page) && page_referenced(page)) {
                                pte_chain_unlock(page);
                                list_add(&page->lru, &l_active);
@@ -767,6 +786,11 @@ shrink_zone(struct zone *zone, int max_s
        ratio = (unsigned long)nr_pages * zone->nr_active /
                                ((zone->nr_inactive | 1) * 2);
        atomic_add(ratio+1, &zone->refill_counter);
+#ifdef CONFIG_MEMHOTPLUGTEST
+       if (! zone_activep(zone))
+               /* XXX */
+               atomic_add(SWAP_CLUSTER_MAX, &zone->refill_counter);
+#endif
        if (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) {
                int count;
 
@@ -1048,6 +1072,439 @@ int kswapd(void *p)
                balance_pgdat(pgdat, 0, &ps);
        }
 }
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+/* Debug aid: print b_state and b_count of every buffer attached to page. */
+static void print_buffer(struct page *page)
+{
+       struct buffer_head *first, *cur;
+       struct address_space *owner = page->mapping;
+
+       spin_lock(&owner->private_lock);
+       first = page_buffers(page);
+       printk("buffers:");
+       for (cur = first; ; cur = cur->b_this_page) {
+               printk(" %lx %d\n", cur->b_state, atomic_read(&cur->b_count));
+               if (cur->b_this_page == first)
+                       break;
+       }
+       printk("\n");
+       spin_unlock(&owner->private_lock);
+}
+/*
+ * Try to move 'page' to a newly allocated page that takes over its place in
+ * the page cache.  Returns 0 once the old page is gone, non-zero on failure.
+ */
+int remap_onepage(struct page *page)
+{
+       struct page *newpage;
+       struct zone *zone;
+       struct address_space *mapping = page->mapping;
+       char *np, *op;
+       void *p;
+       int waitcnt, error = -1;
+
+       newpage = alloc_page(GFP_HIGHUSER);
+       if (newpage == NULL)
+               return -ENOMEM;
+       if (TestSetPageLocked(newpage))
+               BUG();
+       lock_page(page);
+       /* Writeback on a page without buffers is treated as fatal. */
+       if (! PagePrivate(page) && PageWriteback(page))
+#ifdef CONFIG_KDB
+               KDB_ENTER();
+#else
+               BUG();
+#endif
+       /* Buffers attached: wait for writeback, flush, then drop them. */
+       if (PagePrivate(page)) {
+               waitcnt = 100;
+               while (PageWriteback(page)) {
+                       __set_current_state(TASK_INTERRUPTIBLE);
+                       schedule_timeout(10);
+                       __set_current_state(TASK_RUNNING);
+                       if (! --waitcnt)
+                               goto radixfail;
+               }
+
+               /* XXX copied from shrink_list() */
+               if (PageDirty(page) &&
+                   is_page_cache_freeable(page) &&
+                   mapping != NULL &&
+                   mapping->a_ops->writepage != NULL) {
+                       spin_lock(&mapping->page_lock);
+                       if (test_clear_page_dirty(page)) {
+                               int res;
+                               struct writeback_control wbc = {
+                                       .sync_mode = WB_SYNC_NONE,
+                                       .nr_to_write = SWAP_CLUSTER_MAX,
+                                       .nonblocking = 1,
+                                       .for_reclaim = 1,
+                               };
+
+                               list_move(&page->list, &mapping->locked_pages);
+                               spin_unlock(&mapping->page_lock);
+                               SetPageReclaim(page);
+                               res = mapping->a_ops->writepage(page, &wbc);
+                               if (res == WRITEPAGE_ACTIVATE) {
+                                       ClearPageReclaim(page);
+                                       goto radixfail;
+                               }
+                               if (!PageWriteback(page)) {
+                                       /* synchronous write or broken a_ops? */
+                                       ClearPageReclaim(page);
+                               }
+                               lock_page(page);
+                               if (! PagePrivate(page))
+                                       goto bufferdone;
+                       } else
+                               spin_unlock(&mapping->page_lock);
+               }
+               /* Keep trying to drop the buffers; give up after 100 sleeps. */
+               waitcnt = 100;
+               while (1) {
+                       if (try_to_release_page(page, GFP_KERNEL))
+                               break;
+                       __set_current_state(TASK_INTERRUPTIBLE);
+                       schedule_timeout(10);
+                       __set_current_state(TASK_RUNNING);
+                       if (! --waitcnt) {
+                               print_buffer(page);
+                               goto radixfail;
+                       }
+               }
+       }
+bufferdone:
+       if (mapping == NULL) {
+               /* The page is an anon page. Allocate swap entry. */
+               if (!add_to_swap(page))
+                       goto radixfail;
+               mapping = page->mapping;
+       }
+       error = radix_tree_preload(GFP_KERNEL);
+       if (error)
+               goto radixfail;
+       if (PagePrivate(page)) /* XXX */
+               BUG();
+       /* Replace page with newpage in the mapping's radix tree. */
+       /* should {__add_to,__remove_from}_page_cache be used instead? */
+       spin_lock(&mapping->page_lock);
+       if (mapping != page->mapping)
+               printk("mapping changed %p -> %p, page %p\n",
+                   mapping, page->mapping, page);
+       if (radix_tree_delete(&mapping->page_tree, page->index) == NULL) {
+               /* Page truncated. */
+               spin_unlock(&mapping->page_lock);
+               radix_tree_preload_end();
+               goto radixfail;
+       }
+       /* don't __put_page(page) here. truncate may be in progress */
+       /* Inherit flag bits below ZONE_SHIFT except the four cleared here. */
+       newpage->flags |= page->flags & ~(1 << PG_uptodate) &
+           ~(1 << PG_highmem) & ~(1 << PG_chainlock) &
+           ~(1 << PG_direct) & ~(~0UL << ZONE_SHIFT);
+
+       /* list_del(&page->list); XXX */
+       radix_tree_insert(&mapping->page_tree, page->index, newpage);
+       page_cache_get(newpage);
+       newpage->mapping = mapping;
+       newpage->index = page->index;
+       if (PageDirty(page))
+               list_add(&newpage->list, &mapping->dirty_pages);
+       else
+               list_add(&newpage->list, &mapping->clean_pages);
+       spin_unlock(&mapping->page_lock);
+       radix_tree_preload_end();
+       /* Tear down all pte mappings of the old page; retry on SWAP_AGAIN. */
+       pte_chain_lock(page);
+       if (page_mapped(page)) {
+               while ((error = try_to_unmap(page)) == SWAP_AGAIN) {
+                       pte_chain_unlock(page);
+                       __set_current_state(TASK_INTERRUPTIBLE);
+                       schedule_timeout(1);
+                       __set_current_state(TASK_RUNNING);
+                       pte_chain_lock(page);
+               }
+               if (error == SWAP_FAIL) {
+                       pte_chain_unlock(page); /* XXX */
+                       /* either during mremap or mlocked */
+                       goto unmapfail;
+               }
+       }
+       pte_chain_unlock(page);
+       if (PagePrivate(page))
+               printk("buffer reappeared\n");
+       unlock_page(page);      /* no lock needed while waiting page count */
+       /* Wait for other users of the old page to drop their references. */
+       waitcnt = 1;
+wait_again:
+       while (page_count(page) > 2) {
+               waitcnt++;
+               current->state = TASK_INTERRUPTIBLE;
+               schedule_timeout(1);
+               if ((waitcnt % 5000) == 0) {
+                       printk("remap_onepage: still waiting on %p %d\n", page, waitcnt);
+                       break;
+               }
+               if (PagePrivate(page))
+                       break;          /* see below */
+       }
+
+       lock_page(page);
+       BUG_ON(page_count(page) == 0);
+       if (PagePrivate(page))
+               try_to_release_page(page, GFP_KERNEL);
+       if (page_count(page) > 2) {
+               if (waitcnt > 50000)
+                       goto unmapfail;
+               unlock_page(page);
+               goto wait_again;
+       }
+       if (PageReclaim(page) || PageWriteback(page) || PagePrivate(page))
+#ifdef CONFIG_KDB
+               KDB_ENTER();
+#else
+               BUG();
+#endif
+       if (page_count(page) == 1) {
+               /* page has been truncated.  free both pages. */
+               spin_lock(&mapping->page_lock);
+               p = radix_tree_lookup(&mapping->page_tree, newpage->index);
+               if (p != NULL) {
+                       /* new cache page appeared after truncation */
+                       printk("page %p newpage %p radix %p\n",
+                           page, newpage, p);
+                       BUG_ON(p == newpage);
+               }
+               list_del(&newpage->list);
+               put_page(newpage);
+               if (page_count(newpage) != 1) {
+                       printk("newpage count %d != 1, %p\n",
+                           page_count(newpage), newpage);
+                       BUG();
+               }
+               /* No need to do page->list. remove_from_page_cache did. */
+               newpage->mapping = page->mapping = NULL;
+               spin_unlock(&mapping->page_lock);
+               ClearPageActive(page);
+               ClearPageActive(newpage);
+               unlock_page(page);
+               unlock_page(newpage);
+               put_page(page);
+               put_page(newpage);
+               return 0;
+       }
+
+       spin_lock(&mapping->page_lock);
+       list_del(&page->list); /* XXX */
+       page->mapping = NULL;
+       spin_unlock(&mapping->page_lock);
+       unlock_page(page);
+       /* Copy the data.  Unmap by address, as kunmap_atomic() expects. */
+       np = kmap_atomic(newpage, KM_REMAP0);
+       op = kmap_atomic(page, KM_REMAP1);
+       if (np == NULL || op == NULL) { /* XXX */
+               printk("%p %p %p %p\n", np, op, newpage, page);
+               BUG();
+       }
+       memcpy(np, op, PAGE_SIZE);
+       kunmap_atomic(op, KM_REMAP1);
+       kunmap_atomic(np, KM_REMAP0);
+       ClearPageActive(page);
+       __put_page(page);
+       put_page(page);
+       /* We are done. Finish and let the waiters run. */
+       SetPageUptodate(newpage);
+       /* XXX locking order correct? */
+       zone = page_zone(newpage);
+       spin_lock_irq(&zone->lru_lock);
+       if (PageActive(newpage)) {
+               list_add(&newpage->lru, &zone->active_list);
+               zone->nr_active++;
+       } else {
+               list_add(&newpage->lru, &zone->inactive_list);
+               zone->nr_inactive++;
+       }
+       SetPageLRU(newpage);
+       spin_unlock_irq(&zone->lru_lock);
+       unlock_page(newpage);
+       page_cache_release(newpage);
+       return 0;
+
+unmapfail:
+       /*
+        * Try to unwind by notifying waiters.  If someone misbehaves,
+        * we die.
+        */
+       error = radix_tree_preload(GFP_KERNEL);
+       if (error)
+               BUG();
+       /* should {__add_to,__remove_from}_page_cache be used instead? */
+       spin_lock(&mapping->page_lock);
+       /* list_del(&newpage->list); */
+       if (radix_tree_delete(&mapping->page_tree, page->index) == NULL)
+               /* Hold extra count to handle truncate */
+               page_cache_get(newpage);
+       radix_tree_insert(&mapping->page_tree, page->index, page);
+       /* no page_cache_get(page); needed */
+       radix_tree_preload_end();
+       spin_unlock(&mapping->page_lock);
+       SetPageAgain(newpage);
+       /* XXX unmap needed?  No, it shouldn't.  Handled by fault handlers. */
+       unlock_page(newpage);
+       /* Wait for users of newpage to go away; BUG() after 10000 sleeps. */
+       waitcnt = 1;
+       for(; page_count(newpage) > 2; waitcnt++) {
+               current->state = TASK_INTERRUPTIBLE;
+               schedule_timeout(1);
+               if ((waitcnt % 10000) == 0) {
+                       printk("You are hosed.\n");
+                       printk("newpage %p\n", newpage);
+                       BUG();
+               }
+       }
+       BUG_ON(PageUptodate(newpage));
+       ClearPageDirty(newpage);
+       ClearPageActive(newpage);
+       spin_lock(&mapping->page_lock);
+       newpage->mapping = NULL;
+       if (page_count(newpage) == 1) {
+               printk("newpage %p truncated. page %p\n", newpage, page);
+               BUG();
+       }
+       list_del(&newpage->list);
+       spin_unlock(&mapping->page_lock);
+       unlock_page(page);
+       __put_page(newpage);
+       __free_page(newpage);
+       return 1;
+       /* Failed before newpage was inserted into the page cache: free it. */
+radixfail:
+       unlock_page(page);
+       unlock_page(newpage);
+       __free_page(newpage);
+       return 1;
+}
+
+static struct work_struct lru_drain_wq[NR_CPUS];
+/* Queue the calling CPU's entry of lru_drain_wq; 'p' is unused. */
+static void lru_drain_schedule(void *p)
+{
+       int self = get_cpu();
+
+       schedule_work(lru_drain_wq + self);
+       put_cpu();
+}
+
+atomic_t remapd_count;     /* non-zero while a remapd is running */
+/*
+ * Thread function: feed the LRU pages of the zone passed in 'p' to
+ * remap_onepage() until no candidate is left or 100 remaps have failed.
+ */
+int remapd(void *p)
+{
+       struct zone *zone = p;
+       struct page *page, *page1;
+       struct list_head *l;
+       int active, i, nr_failed = 0;
+       int fastmode = 100;     /* how many locked pages may be skipped */
+       LIST_HEAD(failedp);
+
+       daemonize("remap%lu", zone->zone_start_pfn);
+       if (atomic_read(&remapd_count) > 0) {
+               printk("remapd already running\n");
+               return 0;
+       }
+       atomic_inc(&remapd_count);
+       on_each_cpu(lru_drain_schedule, NULL, 1, 1);
+       while(nr_failed < 100) {
+               spin_lock_irq(&zone->lru_lock);
+               for(active = 0; active < 2; active++) {
+                       l = active ? &zone->active_list :
+                           &zone->inactive_list;
+                       for(i = 0; ! list_empty(l) && i < 10; i++) {
+                               page = list_entry(l->prev, struct page, lru);
+                               if (fastmode && PageLocked(page)) {
+                                       page1 = page;
+                                       while (fastmode && PageLocked(page)) {
+                                               page =
+                                                   list_entry(page->lru.prev,
+                                                   struct page, lru);
+                                               fastmode--;
+                                               if (&page->lru == l) {
+                                                       /* scanned whole list */
+                                                       page = page1;
+                                                       break;
+                                               }
+                                               if (page == page1)
+                                                       BUG();
+                                       }
+                                       if (! fastmode) {
+                                               printk("used up fastmode\n");
+                                               page = page1;
+                                       }
+                               }
+                               if (! TestClearPageLRU(page))
+                                       BUG();
+                               list_del(&page->lru);
+                               if (page_count(page) == 0) {
+                                       /* the page is in pagevec_release();
+                                          shrink_cache says so. */
+                                       SetPageLRU(page);
+                                       list_add(&page->lru, l);
+                                       continue;
+                               }
+                               if (active)
+                                       zone->nr_active--;
+                               else
+                                       zone->nr_inactive--;
+                               page_cache_get(page);
+                               spin_unlock_irq(&zone->lru_lock);
+                               goto got_page;
+                       }
+               }
+               spin_unlock_irq(&zone->lru_lock);
+               break;
+
+       got_page:
+               if (remap_onepage(page)) {
+                       nr_failed++;
+                       list_add(&page->lru, &failedp);
+               }
+       }
+       /* Put the pages that could not be remapped back on their LRU list. */
+       while (! list_empty(&failedp)) {
+               spin_lock_irq(&zone->lru_lock);
+               page = list_entry(failedp.prev, struct page, lru);
+               list_del(&page->lru);
+               if (PageActive(page)) {
+                       list_add(&page->lru, &zone->active_list);
+                       zone->nr_active++;
+               } else {
+                       list_add(&page->lru, &zone->inactive_list);
+                       zone->nr_inactive++;
+               }
+               if (TestSetPageLRU(page))
+                       BUG();
+               spin_unlock_irq(&zone->lru_lock);
+               page_cache_release(page);
+       }
+       atomic_dec(&remapd_count);
+       return 0;
+}
+/* Init hook: set up each lru_drain_wq entry with lru_add_drain as its function. */
+static int __init remapd_init(void)
+{
+       struct work_struct *w;
+
+       for (w = lru_drain_wq; w < lru_drain_wq + NR_CPUS; w++)
+               INIT_WORK(w, lru_add_drain, NULL);
+       return 0;
+}
+
+module_init(remapd_init);
+#endif
 
 /*
  * A zone is low on free memory, so wake its kswapd task to service it.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxxx  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@xxxxxxxxx";> aart@xxxxxxxxx </a>



<Prev in Thread] Current Thread [Next in Thread>