diff -Naur -X dontdiff linux-2.4.7.SuSE/Documentation/Configure.help linux-2.4.7.SuSE.hotadd/Documentation/Configure.help --- linux-2.4.7.SuSE/Documentation/Configure.help Thu Aug 16 12:28:40 2001 +++ linux-2.4.7.SuSE.hotadd/Documentation/Configure.help Thu Mar 6 17:45:50 2003 @@ -231,6 +231,11 @@ If unsure, say "off". +Hot-add memory support +CONFIG_X86_MEM_HOTADD + Say Y here if you have an Intel i386 based system with memory hot-plug + capability. + Normal PC floppy disk support CONFIG_BLK_DEV_FD If you want to use the floppy disk drive(s) of your PC under Linux, diff -Naur -X dontdiff linux-2.4.7.SuSE/arch/i386/config.in linux-2.4.7.SuSE.hotadd/arch/i386/config.in --- linux-2.4.7.SuSE/arch/i386/config.in Thu Aug 16 12:28:41 2001 +++ linux-2.4.7.SuSE.hotadd/arch/i386/config.in Thu Mar 6 17:45:50 2003 @@ -158,10 +158,16 @@ 64GB CONFIG_HIGHMEM64G" off if [ "$CONFIG_HIGHMEM4G" = "y" ]; then define_bool CONFIG_HIGHMEM y + define_bool CONFIG_NOHIGHMEM n fi if [ "$CONFIG_HIGHMEM64G" = "y" ]; then define_bool CONFIG_HIGHMEM y + define_bool CONFIG_NOHIGHMEM n define_bool CONFIG_X86_PAE y +fi + +if [ "$CONFIG_NOHIGHMEM" = "n" ]; then + bool 'Hot-add memory support' CONFIG_X86_MEM_HOTADD fi if [ "$CONFIG_NOHIGHMEM" = "y" ]; then define_bool CONFIG_NO_PAGE_VIRTUAL y diff -Naur -X dontdiff linux-2.4.7.SuSE/arch/i386/kernel/setup.c linux-2.4.7.SuSE.hotadd/arch/i386/kernel/setup.c --- linux-2.4.7.SuSE/arch/i386/kernel/setup.c Thu Aug 16 12:28:32 2001 +++ linux-2.4.7.SuSE.hotadd/arch/i386/kernel/setup.c Thu Mar 6 17:45:50 2003 @@ -130,6 +130,12 @@ /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0x10000000; +#ifdef CONFIG_X86_MEM_HOTADD +/* Virtual address space reserved for memory hot-add operation */ +unsigned long hotadd_reserve_size; +unsigned long hotadd_reserve_start; +#endif /* CONFIG_X86_MEM_HOTADD */ + /* * Setup options */ @@ -695,6 +701,8 @@ } /* setup_memory_region */ +static int mem_hotadd_disabled = 0; + static inline void 
parse_mem_cmdline (char ** cmdline_p) { char c = ' ', *to = command_line, *from = COMMAND_LINE; @@ -723,6 +731,9 @@ from += 8+4; e820.nr_map = 0; usermem = 1; + } else if (!memcmp(from+4, "nohotadd", 8)) { + from += 8+4; + mem_hotadd_disabled = 1; } else { /* If the user specifies memory size, we * blow away any automatically generated @@ -765,6 +776,16 @@ } } +#if 0 +static int __init mem_hotadd_setup(char *str) +{ + mem_hotadd_disabled = 1; + return 1; +} + +__setup("no_mem_hotadd", mem_hotadd_setup); +#endif + void __init setup_arch(char **cmdline_p) { unsigned long bootmap_size, low_mem_size; @@ -816,7 +837,14 @@ * 128MB for vmalloc and initrd */ #define VMALLOC_RESERVE (unsigned long)(128 << 20) + +#ifdef CONFIG_X86_MEM_HOTADD +#define MAXMEM (unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE-hotadd_reserve_size) +#else #define MAXMEM (unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE) +#define MAX_NONPAE_PFN (1 << 20) +#endif + #ifdef CONFIG_HIGHMEM_DEBUG #define MAXMEM_PFN \ ({ \ @@ -830,7 +858,6 @@ #else #define MAXMEM_PFN PFN_DOWN(MAXMEM) #endif -#define MAX_NONPAE_PFN (1 << 20) /* * partially used pages are not usable - thus @@ -855,6 +882,36 @@ max_pfn = end; } +#ifdef CONFIG_X86_MEM_HOTADD + /* + * Reserve enough virtual addresses to store page structures for + * hot-added memory. + * Needs work: Should the reserve_size depend on a configuration + * option? + */ + if (! 
mem_hotadd_disabled) { + +#ifndef CONFIG_X86_PAE + if (max_pfn > MAX_NONPAE_PFN) + hotadd_reserve_size = 0; + else + hotadd_reserve_size = (MAX_NONPAE_PFN - max_pfn) * + sizeof(struct page); +#else /* CONFIG_X86_PAE */ + if (max_pfn >= MAX_PAE_PFN) + hotadd_reserve_size = 0; + else + hotadd_reserve_size = (MAX_PAE_PFN - max_pfn) * + sizeof(struct page); + +#endif /* CONFIG_X86_PAE */ + + if (hotadd_reserve_size % PMD_SIZE) + hotadd_reserve_size += (PMD_SIZE - + (hotadd_reserve_size % PMD_SIZE)); + } +#endif /* CONFIG_X86_MEM_HOTADD */ + /* * Determine low and high memory ranges: */ @@ -879,6 +936,13 @@ #endif /* !CONFIG_X86_PAE */ #endif /* !CONFIG_HIGHMEM */ } + +#ifdef CONFIG_X86_MEM_HOTADD + if (hotadd_reserve_size) + hotadd_reserve_start = (unsigned long)__va(max_low_pfn * PAGE_SIZE); + else + hotadd_reserve_start = 0; +#endif /* CONFIG_X86_MEM_HOTADD */ #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; diff -Naur -X dontdiff linux-2.4.7.SuSE/arch/i386/mm/Makefile linux-2.4.7.SuSE.hotadd/arch/i386/mm/Makefile --- linux-2.4.7.SuSE/arch/i386/mm/Makefile Fri Dec 29 16:07:20 2000 +++ linux-2.4.7.SuSE.hotadd/arch/i386/mm/Makefile Thu Mar 6 17:45:50 2003 @@ -11,4 +11,9 @@ obj-y := init.o fault.o ioremap.o extable.o +ifeq ($(CONFIG_X86_MEM_HOTADD),y) +obj-y += mem_hotadd.o +export-objs += mem_hotadd.o +endif + include $(TOPDIR)/Rules.make diff -Naur -X dontdiff linux-2.4.7.SuSE/arch/i386/mm/fault.c linux-2.4.7.SuSE.hotadd/arch/i386/mm/fault.c --- linux-2.4.7.SuSE/arch/i386/mm/fault.c Thu Aug 16 12:28:33 2001 +++ linux-2.4.7.SuSE.hotadd/arch/i386/mm/fault.c Thu Mar 6 17:45:50 2003 @@ -352,6 +352,9 @@ goto no_context; set_pmd(pmd, *pmd_k); + /* If PMD points to a 4MB page, no PTEs to update */ + if (pmd_has_pse(*pmd_k)) + return; pte_k = pte_offset(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; diff -Naur -X dontdiff linux-2.4.7.SuSE/arch/i386/mm/init.c linux-2.4.7.SuSE.hotadd/arch/i386/mm/init.c --- linux-2.4.7.SuSE/arch/i386/mm/init.c Fri Apr 
20 18:15:20 2001 +++ linux-2.4.7.SuSE.hotadd/arch/i386/mm/init.c Thu Mar 6 17:45:50 2003 @@ -37,8 +37,14 @@ #include unsigned long highstart_pfn, highend_pfn; + +#ifndef CONFIG_X86_MEM_HOTADD static unsigned long totalram_pages; static unsigned long totalhigh_pages; +#else +unsigned long totalram_pages; +unsigned long totalhigh_pages; +#endif int do_check_pgt_cache(int low, int high) { diff -Naur -X dontdiff linux-2.4.7.SuSE/arch/i386/mm/mem_hotadd.c linux-2.4.7.SuSE.hotadd/arch/i386/mm/mem_hotadd.c --- linux-2.4.7.SuSE/arch/i386/mm/mem_hotadd.c Wed Dec 31 18:00:00 1969 +++ linux-2.4.7.SuSE.hotadd/arch/i386/mm/mem_hotadd.c Thu Mar 6 17:45:50 2003 @@ -0,0 +1,420 @@ +/* + * arch/i386/mm/mem_hotadd.c + * (c) 2002 Hewlett-Packard Development Company, L.P. + */ + +#include +#include + +#ifdef CONFIG_MODULES +#include +#include +#endif + +/* + * The functions in this file integrate added memory into the system. + * Flow: + * 1. Calculate size of data structures needed + * 2. Make part of the new memory accesible by setting PGDs. Allocate + * data structures in this area + * 3. Initialize data structures - pg_data_t and mem map + * 4. Reserve memory used up by data structures + * 5. Add rest of the memory to free list + * 6. Update global variables + */ + +int mem_hotadd_count = 0; +static DECLARE_MUTEX(mem_hotadd_sem); + +extern int init_pgdat(pg_data_t *, unsigned long long, unsigned long, unsigned long *, unsigned long *, struct page *, int); + +extern unsigned long hotadd_reserve_size; +extern unsigned long hotadd_reserve_start; +extern unsigned long totalram_pages; +extern unsigned long totalhigh_pages; + +static unsigned long hotadd_vaddr_start = -1; +static unsigned long hotadd_vaddr_left = -1; + +static void hotadd_mem_cleanup(unsigned long, unsigned long); + + +/* + * hotadd_mem_bootstrap(): Map part of the newly added memory to the + * kernel address space, starting at vaddr_start. + * Return size of the mapped memory (in bytes). 
+ */ + +static int +hotadd_mem_bootstrap (kaddr_t paddr_start, unsigned long totalpages) +{ + int required; + kaddr_t paddr; + unsigned long vaddr, vaddr_start, vaddr_end; + int i,j,k; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte, *pte_base; + + /* + * Calculate data structure sizes: pg_data_t and memory map + * Also get the number of PGDs required to access that amount + * of memory + */ +#if 0 + int map_size; + map_size = (sizeof(struct page) * (totalpages+1)); + required = map_size + sizeof(pg_data_t); +#endif + + vaddr_start = hotadd_vaddr_start; + required = (sizeof(struct page) * totalpages); + + if (hotadd_vaddr_left < required) + return -ENOSPC; + + + printk ("hotadd_mem_bootstrap: Initializing %lu memory pages for data structures\n", (required / PAGE_SIZE)); + /* + * Create PGD and PMD entries and set them in the global page directory + */ + vaddr_end = vaddr_start + required; + vaddr = vaddr_start; + paddr = paddr_start; + + while (vaddr < vaddr_end) { + + i = __pgd_offset(vaddr); + j = __pmd_offset(vaddr); + k = __pte_offset(vaddr); + pgd = (pgd_t *)(swapper_pg_dir + i); + pmd = pmd_offset(pgd, vaddr); + + if ( ((vaddr % PMD_SIZE) == 0) && + ((paddr % PMD_SIZE) == 0) && + (required >= PMD_SIZE) && + (cpu_has_pse)) { + kaddr_t __pe; + + __pe = _KERNPG_TABLE + _PAGE_PSE + paddr; + + /* Make it "global" too if supported */ + if (cpu_has_pge) { + __pe += _PAGE_GLOBAL; + } + set_pmd(pmd, __pmd(__pe)); + paddr += PMD_SIZE; + vaddr += PMD_SIZE; + required -= PMD_SIZE; + continue; + } + else { + int k_max = (required + PAGE_SIZE - 1) / PAGE_SIZE; + if ((k + k_max) > PTRS_PER_PTE) + k_max = PTRS_PER_PTE; + else + k_max += k; + + if (pmd_val(*pmd)) + pte_base = (pte_t *) __va(pmd_val(*pmd) & PAGE_MASK); + else { + pte_base = (pte_t *) kmalloc(PAGE_SIZE,GFP_KERNEL); + if (pte_base == NULL) { + printk("hotadd_mem_bootstrap: " + "kmalloc failed. 
" + "Cleaning up..\n"); + hotadd_mem_cleanup(vaddr_start, vaddr); + return -ENOMEM; + } + memset(pte_base,0,PAGE_SIZE); + } + pte = pte_base + k; + for (; k < k_max; pte++, k++) { + set_pte(pte, mk_pte_phys(paddr, PAGE_KERNEL)); + paddr += PAGE_SIZE; + vaddr += PAGE_SIZE; + required -= PAGE_SIZE; + } + if (! pmd_val(*pmd)) + set_pmd(pmd, __pmd(_KERNPG_TABLE + + __pa(pte_base))); + } + } + + return (vaddr_end - vaddr_start); +} + +/* + * hotadd_init_pgdat(): Initialize the pg_data_t and page structures + * needed for newly added memory. Reserve used pages and add rest of + * the pages to free list. + */ +static int +hotadd_init_pgdat(unsigned long totalpages, kaddr_t paddr_start, int used) +{ + struct page *map; + pg_data_t *pgdat, *temp_pgdat; + unsigned long zone_sizes[MAX_NR_ZONES]; + unsigned long zhole_sizes[MAX_NR_ZONES]; + unsigned long vaddr_start; + int map_size; + int i, ret = 0; + + vaddr_start = hotadd_vaddr_start; + + for (i=0; inode_next) + temp_pgdat = temp_pgdat->node_next; + + temp_pgdat->node_next = pgdat; + mb(); + + mem_hotadd_count++; + mb(); + + /* Add all (un-reserved) pages to free list */ + for (i=0; iflags)); + if (! PageReserved(map+i)) { + set_page_count(map+i, 1); + __free_page(map+i); + } + } + return 0; +} + +static void +hotadd_init_done(unsigned long totalpages, int used) +{ + unsigned long next_start; + + printk("hotadd_init_done: Updating global variables\n"); + + next_start = (hotadd_vaddr_start + used + PAGE_SIZE - 1) & PAGE_MASK; + hotadd_vaddr_left -= (next_start - hotadd_vaddr_start); + hotadd_vaddr_start = next_start; + + /* + * Update globals. 
+ */ + numnodes++; + num_physpages += totalpages; + totalram_pages += totalpages; + totalhigh_pages += totalpages; + max_mapnr += totalpages; + highend_pfn += totalpages; +} + +static void +hotadd_mem_cleanup(unsigned long vaddr_start, unsigned long vaddr_end) +{ + unsigned long vaddr = vaddr_start; + int i, j; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte_base, *pte; + + i = __pgd_offset(vaddr); + j = __pte_offset(vaddr); + pgd = (pgd_t *)(swapper_pg_dir + i); + pmd = pmd_offset(pgd, vaddr); + + if (! pmd_val(*pmd)) + BUG(); + + if (pmd_has_pse(*pmd)) { + pmd_clear(pmd); + vaddr += PMD_SIZE; + } + else { + pte_base = (pte_t *) __va(pmd_val(*pmd) & PAGE_MASK); + + pte = pte_base + j; + for (; j < PTRS_PER_PTE; pte++, j++) { + pte_clear(pte); + vaddr += PAGE_SIZE; + } + } + + while (vaddr < vaddr_end) { + + i = __pgd_offset(vaddr); + pgd = (pgd_t *)(swapper_pg_dir + i); + pmd = pmd_offset(pgd, vaddr); + + if (! pmd_val(*pmd)) + BUG(); + + if (! pmd_has_pse(*pmd)) { + pte_base = (pte_t *) __va(pmd_val(*pmd) & PAGE_MASK); + kfree(pte_base); + } + pmd_clear(pmd); + vaddr += PMD_SIZE; + } +} + + +/* + * hotadd_mem_init(): Integrate hot-added memory into the system. + */ +int hotadd_mem_init(unsigned long long start, unsigned long long size, + int flags) +{ +#ifndef CONFIG_X86_PAE + unsigned long paddr_start = (unsigned long)start; +#else + unsigned long long paddr_start = start; +#endif + pg_data_t *pgdat; + unsigned long pages = size / PAGE_SIZE; + unsigned long totalpages; + unsigned long start_pfn = (paddr_start >> PAGE_SHIFT); + unsigned long end_pfn = start_pfn + pages - 1; + unsigned long old_flags; + int used; + int ret = 0; + + printk ("Attempting to hot-add memory. Start address = 0x%Lx, Size = 0x%Lx (%lu pages)\n", start, size, pages); + + printk ("Free pages before hot-add: %u\n\n", nr_free_pages()); + + if (paddr_start & ~PAGE_MASK) + return -EINVAL; + + /* + * Adding to DMA or Normal zones is not allowed. 
+ */ + if (paddr_start < __pa(high_memory)) + return -EINVAL; + +#ifndef CONFIG_X86_PAE + if (start_pfn >= MAX_NONPAE_PFN) + return -E2BIG; +#else + if (start_pfn >= MAX_PAE_PFN) + return -E2BIG; +#endif + +#ifndef CONFIG_X86_PAE + totalpages = (end_pfn < MAX_NONPAE_PFN) ? pages : + (MAX_NONPAE_PFN - start_pfn); + +#else + totalpages = (end_pfn < MAX_PAE_PFN) ? pages : + (MAX_PAE_PFN - start_pfn); +#endif + end_pfn = start_pfn + totalpages - 1; + + if (totalpages < pages) { + printk ("hotadd_mem_init: Can only add %lu pages out of %lu\n", totalpages, pages); + } + + down(&mem_hotadd_sem); + + /* + * Check for overlapping address ranges (should never happen..) + */ + pgdat = pgdat_list; + while (pgdat) { + kaddr_t pgdat_start = (pgdat->node_start_paddr / PAGE_SIZE); + kaddr_t pgdat_end = pgdat_start + pgdat->node_size - 1; + + /* + * Two closed PFN ranges intersect iff each one starts no + * later than the other ends. Unlike testing only the new + * range's two endpoints, this also rejects a new range that + * completely encloses an existing node. + */ + if ((start_pfn <= pgdat_end) && (end_pfn >= pgdat_start)) { + printk ("hotadd_mem_init: Overlapping memory region\n"); + ret = -EEXIST; + goto err; + } + pgdat = pgdat->node_next; + } + + if (hotadd_vaddr_start == -1) + hotadd_vaddr_start = hotadd_reserve_start; + if (hotadd_vaddr_left == -1) + hotadd_vaddr_left = hotadd_reserve_size; + + /* Return if the reserved virtual address range is used up */ + if (hotadd_vaddr_left == 0) { + ret = -ENOMEM; + goto err; + } + + old_flags = current->flags; + current->flags |= PF_MEMALLOC; + + used = hotadd_mem_bootstrap(paddr_start, totalpages); + + if (used < 0) { + printk("hotadd_mem_init: Failed to initialize memory for data structures\n"); + ret = used; + goto done; + } + + ret = hotadd_init_pgdat(totalpages, paddr_start, used); + if (ret < 0) { + printk ("hotadd_init_pgdat failed. 
Cleaning up...\n"); + hotadd_mem_cleanup(hotadd_vaddr_start, + (hotadd_vaddr_start + used)); + goto done; + } + + hotadd_init_done(totalpages, used); + + printk ("Memory hot-add operation completed successfully\n"); + printk ("Free pages after hot-add: %u\n\n", nr_free_pages()); +done: + current->flags = old_flags; +err: + up(&mem_hotadd_sem); + return ret; +} + +EXPORT_SYMBOL(hotadd_mem_init); +EXPORT_SYMBOL(mem_hotadd_count); diff -Naur -X dontdiff linux-2.4.7.SuSE/include/asm-i386/mem_hotadd.h linux-2.4.7.SuSE.hotadd/include/asm-i386/mem_hotadd.h --- linux-2.4.7.SuSE/include/asm-i386/mem_hotadd.h Wed Dec 31 18:00:00 1969 +++ linux-2.4.7.SuSE.hotadd/include/asm-i386/mem_hotadd.h Thu Mar 6 17:45:50 2003 @@ -0,0 +1,125 @@ +/* + * linux/include/asm-i386/mem_hotadd.h + * (c) 2002 Hewlett-Packard Development Company, L.P. + */ + +#ifndef __ASM_MEM_HOTADD_H +#define __ASM_MEM_HOTADD_H + +/* + * Re-defines macros defined elsewhere to handle the case of multiple + * pg_data_t structures and memory maps + */ + + +extern int mem_hotadd_count; + +#ifdef CONFIG_X86_PAE +#define kaddr_t unsigned long long +#else +#define kaddr_t unsigned long +#endif + +#define MAX_NONPAE_PFN (1 << 20) +#define MAX_PAE_PFN (1 << 23) + +/* + * Return a pointer to the pg_data_t structure corresponding to + * a given physical address + */ +static inline pg_data_t * __addr_to_pgdat(kaddr_t addr) +{ + pg_data_t *temp; + + if (mem_hotadd_count == 0) + return (pgdat_list); + + temp = pgdat_list; + while (temp) { + if ((addr >= temp->node_start_paddr) && + ((addr >> PAGE_SHIFT) < ((temp->node_start_paddr >> PAGE_SHIFT) + temp->node_size))) + return temp; + temp = temp->node_next; + } + + return 0; +} + +/* + * Return a pointer to the pg_data_t structure corresponding to + * a page structure + */ +static inline pg_data_t * __page_to_pgdat(struct page *page) +{ + pg_data_t *temp; + + if ((mem_hotadd_count == 0) && ((page - pgdat_list->node_mem_map) < pgdat_list->node_size)) + return (pgdat_list); + + 
temp = pgdat_list; + while (temp) { + if ((page >= temp->node_mem_map) && + ((page - temp->node_mem_map) < temp->node_size)) + return (temp); + temp=temp->node_next; + } + return (pg_data_t *)0; +} + +/* + * Return a pointer to the mem map array corresponding to + * a page structure + */ +static inline struct page * __page_to_mem_map(struct page *page) +{ + pg_data_t *temp = __page_to_pgdat(page); + if (temp) + return temp->node_mem_map; + else + return (struct page *)0; +} + +/* + * Similar to the two inline functions above, but assume that + * the page structure is valid. + */ +#define __page_zone(p) ((p)->zone) +#define __page_mem_map(p) (__page_zone(p)->zone_mem_map) + +/* Return the page frame number of a given page structure */ +#define __page_pfn(p) (((p) - (__page_mem_map(p))) + (__page_zone(p)->zone_start_mapnr)) + + +/* Convert a physical address to 'struct page'; returns 0 when __addr_to_pgdat() finds no node */ +static inline struct page * addr_to_page(kaddr_t addr) +{ + unsigned long index; + pg_data_t *temp = __addr_to_pgdat(addr); + if (temp) { + index = (addr >> PAGE_SHIFT) - (temp->node_start_paddr >> PAGE_SHIFT); /* page index, not byte offset; shifting avoids masking a 64-bit addr with the 32-bit PAGE_MASK */ + return (temp->node_mem_map + index); + } + return 0; +} + +/* Convert a "struct page" to a physical address (page offset is widened to kaddr_t before the shift so it cannot overflow 32 bits) */ +#define page_to_phys(p) ((kaddr_t)(__page_zone(p)->zone_start_paddr + ((kaddr_t)((p) - __page_mem_map(p)) << PAGE_SHIFT))) + +/* + * Return the page structure corresponding to a PTE + */ +static inline struct page * pte_page(pte_t pte) +{ + pg_data_t *temp = __addr_to_pgdat((kaddr_t)pte_val(pte)); + if (temp) + return (temp->node_mem_map + (((kaddr_t)pte_val(pte) - temp->node_start_paddr) >> PAGE_SHIFT)); + + return 0; +} + +#define VALID_PAGE(page) ((__page_to_mem_map(page) != 0)) + +#define mk_pte(page, pgprot) __mk_pte((__page_pfn(page)), (pgprot)) + +extern int hotadd_mem_init(unsigned long long, unsigned long long, int); +#endif /* ASM_MEM_HOTADD_H */ diff -Naur -X dontdiff linux-2.4.7.SuSE/include/asm-i386/page.h linux-2.4.7.SuSE.hotadd/include/asm-i386/page.h --- 
linux-2.4.7.SuSE/include/asm-i386/page.h Thu Aug 16 12:31:54 2001 +++ linux-2.4.7.SuSE.hotadd/include/asm-i386/page.h Thu Mar 6 17:45:50 2003 @@ -115,7 +115,11 @@ #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #define virt_to_page(kaddr) (mem_map + (__pa(kaddr) >> PAGE_SHIFT)) + +/* Defined in hotadd.h to handle multiple mem maps */ +#ifndef CONFIG_X86_MEM_HOTADD #define VALID_PAGE(page) ((page - mem_map) < max_mapnr) +#endif /* !CONFIG_X86_MEM_HOTADD */ #endif /* __KERNEL__ */ diff -Naur -X dontdiff linux-2.4.7.SuSE/include/asm-i386/pgtable-2level.h linux-2.4.7.SuSE.hotadd/include/asm-i386/pgtable-2level.h --- linux-2.4.7.SuSE/include/asm-i386/pgtable-2level.h Wed Oct 18 16:25:46 2000 +++ linux-2.4.7.SuSE.hotadd/include/asm-i386/pgtable-2level.h Thu Mar 6 17:45:50 2003 @@ -56,7 +56,11 @@ } #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0)) #define pte_same(a, b) ((a).pte_low == (b).pte_low) + +#ifndef CONFIG_X86_MEM_HOTADD #define pte_page(x) (mem_map+((unsigned long)(((x).pte_low >> PAGE_SHIFT)))) +#endif /* CONFIG_X86_MEM_HOTADD */ + #define pte_none(x) (!(x).pte_low) #define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot)) diff -Naur -X dontdiff linux-2.4.7.SuSE/include/asm-i386/pgtable-3level.h linux-2.4.7.SuSE.hotadd/include/asm-i386/pgtable-3level.h --- linux-2.4.7.SuSE/include/asm-i386/pgtable-3level.h Sun Mar 25 19:38:31 2001 +++ linux-2.4.7.SuSE.hotadd/include/asm-i386/pgtable-3level.h Thu Mar 6 17:45:50 2003 @@ -86,7 +86,10 @@ return a.pte_low == b.pte_low && a.pte_high == b.pte_high; } +#ifndef CONFIG_X86_MEM_HOTADD #define pte_page(x) (mem_map+(((x).pte_low >> PAGE_SHIFT) | ((x).pte_high << (32 - PAGE_SHIFT)))) +#endif + #define pte_none(x) (!(x).pte_low && !(x).pte_high) static inline pte_t __mk_pte(unsigned long page_nr, pgprot_t pgprot) diff -Naur -X dontdiff linux-2.4.7.SuSE/include/asm-i386/pgtable.h 
linux-2.4.7.SuSE.hotadd/include/asm-i386/pgtable.h --- linux-2.4.7.SuSE/include/asm-i386/pgtable.h Thu Aug 16 12:31:54 2001 +++ linux-2.4.7.SuSE.hotadd/include/asm-i386/pgtable.h Thu Mar 6 17:45:50 2003 @@ -137,9 +137,18 @@ * area for the same reason. ;) */ #define VMALLOC_OFFSET (8*1024*1024) + +#ifdef CONFIG_X86_MEM_HOTADD +extern unsigned long hotadd_reserve_size; +#define VMALLOC_START (((unsigned long) high_memory + hotadd_reserve_size + \ + 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1)) +#else /* CONFIG_X86_MEM_HOTADD */ #define VMALLOC_START (((unsigned long) high_memory + 2*VMALLOC_OFFSET-1) & \ ~(VMALLOC_OFFSET-1)) +#endif /* CONFIG_X86_MEM_HOTADD */ + #define VMALLOC_VMADDR(x) ((unsigned long)(x)) + #if CONFIG_HIGHMEM # define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) #else @@ -250,6 +259,7 @@ #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#define pmd_has_pse(x) (pmd_val(x) & _PAGE_PSE) /* * Permanent address of a page. Obviously must never be @@ -293,7 +303,9 @@ * and a page entry and page directory to the page they refer to. */ +#ifndef CONFIG_X86_MEM_HOTADD #define mk_pte(page, pgprot) __mk_pte((page) - mem_map, (pgprot)) +#endif /* CONFIG_X86_MEM_HOTADD */ /* This takes a physical page address that is used by the remapping functions */ #define mk_pte_phys(physpage, pgprot) __mk_pte((physpage) >> PAGE_SHIFT, pgprot) diff -Naur -X dontdiff linux-2.4.7.SuSE/include/linux/mm.h linux-2.4.7.SuSE.hotadd/include/linux/mm.h --- linux-2.4.7.SuSE/include/linux/mm.h Thu Aug 16 12:31:54 2001 +++ linux-2.4.7.SuSE.hotadd/include/linux/mm.h Thu Mar 6 17:45:50 2003 @@ -167,6 +167,10 @@ struct zone_struct *zone; /* Memory zone we are in. */ } mem_map_t; +#ifdef CONFIG_X86_MEM_HOTADD +#include +#endif + /* * Methods to modify the page usage count. 
* diff -Naur -X dontdiff linux-2.4.7.SuSE/include/linux/mmzone.h linux-2.4.7.SuSE.hotadd/include/linux/mmzone.h --- linux-2.4.7.SuSE/include/linux/mmzone.h Thu Aug 16 12:31:54 2001 +++ linux-2.4.7.SuSE.hotadd/include/linux/mmzone.h Thu Mar 6 17:45:50 2003 @@ -50,7 +50,7 @@ */ struct pglist_data *zone_pgdat; struct page *zone_mem_map; - unsigned long zone_start_paddr; + unsigned long long zone_start_paddr; unsigned long zone_start_mapnr; /* @@ -100,7 +100,7 @@ struct page *node_mem_map; unsigned long *valid_addr_bitmap; struct bootmem_data *bdata; - unsigned long node_start_paddr; + unsigned long long node_start_paddr; unsigned long node_start_mapnr; unsigned long node_size; int node_id; @@ -124,6 +124,23 @@ struct page *pmap); extern pg_data_t contig_page_data; + +static inline zone_t * next_node_zone(zone_t *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + int index = zone - (pgdat->node_zones); + + if (pgdat->node_next) { + pgdat = pgdat->node_next; + zone = pgdat->node_zones + index; + } + else + zone = NULL; + return zone; +} + +#define for_each_node_zone(zone) \ + for(; zone; zone = next_node_zone(zone)) #ifndef CONFIG_DISCONTIGMEM diff -Naur -X dontdiff linux-2.4.7.SuSE/mm/Makefile linux-2.4.7.SuSE.hotadd/mm/Makefile --- linux-2.4.7.SuSE/mm/Makefile Wed Jul 11 17:46:28 2001 +++ linux-2.4.7.SuSE.hotadd/mm/Makefile Thu Mar 6 17:45:50 2003 @@ -9,7 +9,7 @@ O_TARGET := mm.o -export-objs := shmem.o +export-objs := shmem.o page_alloc.o obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ diff -Naur -X dontdiff linux-2.4.7.SuSE/mm/page_alloc.c linux-2.4.7.SuSE.hotadd/mm/page_alloc.c --- linux-2.4.7.SuSE/mm/page_alloc.c Thu Aug 16 12:28:33 2001 +++ linux-2.4.7.SuSE.hotadd/mm/page_alloc.c Thu Mar 6 17:45:50 2003 @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -22,6 +23,7 @@ int nr_active_pages; int nr_inactive_dirty_pages; pg_data_t *pgdat_list; +EXPORT_SYMBOL(pgdat_list); static 
char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; static int zone_balance_ratio[MAX_NR_ZONES] = { 32, 128, 128, }; @@ -53,7 +55,9 @@ /* * Temporary debugging check. */ -#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->zone_start_mapnr) || (((x)-mem_map) >= (zone)->zone_start_mapnr+(zone)->size)) +#define MEM_MAP(x) ((x)->zone->zone_pgdat->node_mem_map) + +#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-MEM_MAP(x)) < (zone)->zone_start_mapnr) || (((x)-MEM_MAP(x)) >= (zone)->zone_start_mapnr+(zone)->size)) /* * Buddy system. Hairy. You really aren't expected to understand this @@ -232,35 +236,37 @@ if (!z) break; - if (!z->size) - BUG(); - + for_each_node_zone(z) { + if (! z->size) + continue; /* * We allocate if the number of free + inactive_clean * pages is above the watermark. */ - switch (limit) { - default: - case PAGES_MIN: - water_mark = z->pages_min; - break; - case PAGES_LOW: - water_mark = z->pages_low; - break; - case PAGES_HIGH: - water_mark = z->pages_high; - } + switch (limit) { + default: + case PAGES_MIN: + water_mark = z->pages_min; + break; + case PAGES_LOW: + water_mark = z->pages_low; + break; + case PAGES_HIGH: + water_mark = z->pages_high; + } - if (z->free_pages + z->inactive_clean_pages >= water_mark) { - struct page *page = NULL; - /* If possible, reclaim a page directly. */ - if (direct_reclaim) - page = reclaim_page(z); - /* If that fails, fall back to rmqueue. */ - if (!page) - page = rmqueue(z, order); - if (page) - return page; + if (z->free_pages + z->inactive_clean_pages >= + water_mark) { + struct page *page = NULL; + /* If possible, reclaim a page directly. */ + if (direct_reclaim) + page = reclaim_page(z); + /* If that fails, fall back to rmqueue. 
*/ + if (!page) + page = rmqueue(z, order); + if (page) + return page; + } } } @@ -317,16 +323,18 @@ zone_t *z = *(zone++); if (!z) break; - if (!z->size) - BUG(); - if (z->free_pages >= z->pages_low) { - page = rmqueue(z, order); - if (page) - return page; - } else if (z->free_pages < z->pages_min && - waitqueue_active(&kreclaimd_wait)) { - wake_up_interruptible(&kreclaimd_wait); + for_each_node_zone(z) { + if (! z->size) + continue; + if (z->free_pages >= z->pages_low) { + page = rmqueue(z, order); + if (page) + return page; + } else if (z->free_pages < z->pages_min && + waitqueue_active(&kreclaimd_wait)) { + wake_up_interruptible(&kreclaimd_wait); + } } } @@ -418,17 +426,21 @@ break; if (!z->size) continue; - while (z->inactive_clean_pages) { - struct page * page; + for_each_node_zone(z) { + if (! z->size) + continue; + while (z->inactive_clean_pages) { + struct page * page; /* Move one page to the free list. */ - page = reclaim_page(z); - if (!page) - break; - __free_page(page); + page = reclaim_page(z); + if (!page) + break; + __free_page(page); /* Try if the allocation succeeds. */ - page = rmqueue(z, order); - if (page) - return page; + page = rmqueue(z, order); + if (page) + return page; + } } } } @@ -477,8 +489,6 @@ struct page * page = NULL; if (!z) break; - if (!z->size) - BUG(); /* * SUBTLE: direct_reclaim is only possible if the task @@ -486,19 +496,23 @@ * happen when the OOM killer selects this task for * instant execution... */ - if (direct_reclaim) { - page = reclaim_page(z); + for_each_node_zone(z) { + if (! z->size) + continue; + if (direct_reclaim) { + page = reclaim_page(z); + if (page) + return page; + } + + /* XXX: is pages_min/4 a good amount to reserve for this? */ + if (z->free_pages < z->pages_min / 4 && + !(current->flags & PF_MEMALLOC)) + continue; + page = rmqueue(z, order); if (page) return page; } - - /* XXX: is pages_min/4 a good amount to reserve for this? 
*/ - if (z->free_pages < z->pages_min / 4 && - !(current->flags & PF_MEMALLOC)) - continue; - page = rmqueue(z, order); - if (page) - return page; } /* No luck.. */ @@ -711,12 +725,12 @@ */ case ZONE_HIGHMEM: zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->size) { #ifndef CONFIG_HIGHMEM + if (zone->size) { BUG(); -#endif - zonelist->zones[j++] = zone; } +#endif + zonelist->zones[j++] = zone; case ZONE_NORMAL: zone = pgdat->node_zones + ZONE_NORMAL; if (zone->size) @@ -732,57 +746,20 @@ #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) -/* - * Set up the zone data structures: - * - mark all pages reserved - * - mark all memory queues empty - * - clear the memory bitmaps - */ -void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, - unsigned long *zones_size, unsigned long zone_start_paddr, - unsigned long *zholes_size, struct page *lmem_map) +int init_pgdat(pg_data_t *pgdat, + unsigned long long zone_start_paddr, unsigned long totalpages, + unsigned long *zones_size, unsigned long *zholes_size, + struct page *lmem_map, int boot_flag) { struct page *p; + unsigned long offset; unsigned long i, j; - unsigned long map_size; - unsigned long totalpages, offset, realtotalpages; const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); - if (zone_start_paddr & ~PAGE_MASK) - BUG(); - - totalpages = 0; - for (i = 0; i < MAX_NR_ZONES; i++) { - unsigned long size = zones_size[i]; - totalpages += size; - } - realtotalpages = totalpages; - if (zholes_size) - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= zholes_size[i]; - - printk("On node %d totalpages: %lu\n", nid, realtotalpages); - - memlist_init(&active_list); - memlist_init(&inactive_dirty_list); - - /* - * Some architectures (with lots of mem and discontinous memory - * maps) have to search for a good mem_map area: - * For discontigmem, the conceptual mem map array starts from - * PAGE_OFFSET, we need to align the actual array onto a mem map - * boundary, so 
that MAP_NR works. - */ - map_size = (totalpages + 1)*sizeof(struct page); - if (lmem_map == (struct page *)0) { - lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size); - lmem_map = (struct page *)(PAGE_OFFSET + - MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); - } - *gmap = pgdat->node_mem_map = lmem_map; + pgdat->node_mem_map = lmem_map; pgdat->node_size = totalpages; pgdat->node_start_paddr = zone_start_paddr; - pgdat->node_start_mapnr = (lmem_map - mem_map); + pgdat->node_start_mapnr = (zone_start_paddr >> PAGE_SHIFT); /* * Initially all pages are reserved - free ones are freed @@ -790,13 +767,15 @@ * done. */ for (p = lmem_map; p < lmem_map + totalpages; p++) { - set_page_count(p, 0); - SetPageReserved(p); + if (boot_flag) { + set_page_count(p, 0); + SetPageReserved(p); + } init_waitqueue_head(&p->wait); memlist_init(&p->list); } - offset = lmem_map - mem_map; + offset = 0; for (j = 0; j < MAX_NR_ZONES; j++) { zone_t *zone = pgdat->node_zones + j; unsigned long mask; @@ -840,15 +819,15 @@ freepages.min += mask; freepages.low += mask*2; freepages.high += mask*3; - zone->zone_mem_map = mem_map + offset; - zone->zone_start_mapnr = offset; + zone->zone_mem_map = lmem_map + offset; + zone->zone_start_mapnr = (zone_start_paddr >> PAGE_SHIFT); zone->zone_start_paddr = zone_start_paddr; if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1)) printk("BUG: wrong zone alignment, it will crash\n"); for (i = 0; i < size; i++) { - struct page *page = mem_map + offset + i; + struct page *page = lmem_map + offset + i; page->zone = zone; #ifndef CONFIG_NO_PAGE_VIRTUAL if (j != ZONE_HIGHMEM) @@ -872,11 +851,90 @@ bitmap_size = size >> (i+1); bitmap_size = (bitmap_size + 7) >> 3; bitmap_size = LONG_ALIGN(bitmap_size); - zone->free_area[i].map = - (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); + if (boot_flag) { + zone->free_area[i].map = + (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); + } + else { + zone->free_area[i].map = + 
(unsigned long *)kmalloc(bitmap_size,GFP_KERNEL); + if (zone->free_area[i].map == NULL) + goto err_cleanup; + + memset(zone->free_area[i].map,0,bitmap_size); + } } } build_zonelists(pgdat); + + return 0; + +err_cleanup: + if (boot_flag) + BUG(); + printk ("init_pgdat: kmalloc failed. Cleaning up...\n"); + for (i=0; i<MAX_NR_ZONES; i++) { + zone_t *zone = pgdat->node_zones + i; + if (! zone->size) + continue; + + for (j=0; j < MAX_ORDER; j++) { /* here i is the zone index and j the order */ + if (zone->free_area[j].map) + kfree(zone->free_area[j].map); + } + } + + return -ENOMEM; +} + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + */ +void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, + unsigned long *zones_size, unsigned long zone_start_paddr, + unsigned long *zholes_size, struct page *lmem_map) +{ + unsigned long map_size; + unsigned long totalpages, realtotalpages; + int i; + + if (zone_start_paddr & ~PAGE_MASK) + BUG(); + + totalpages = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + unsigned long size = zones_size[i]; + totalpages += size; + } + realtotalpages = totalpages; + if (zholes_size) + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= zholes_size[i]; + + printk("On node %d totalpages: %lu\n", nid, realtotalpages); + + memlist_init(&active_list); + memlist_init(&inactive_dirty_list); + + /* + * Some architectures (with lots of mem and discontinous memory + * maps) have to search for a good mem_map area: + * For discontigmem, the conceptual mem map array starts from + * PAGE_OFFSET, we need to align the actual array onto a mem map + * boundary, so that MAP_NR works. 
+ */ + map_size = (totalpages + 1)*sizeof(struct page); + if (lmem_map == (struct page *)0) { + lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size); + lmem_map = (struct page *)(PAGE_OFFSET + + MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); + } + *gmap = lmem_map; + init_pgdat(pgdat, zone_start_paddr, totalpages, + zones_size, zholes_size, lmem_map, 1); } void __init free_area_init(unsigned long *zones_size)