From 1d8b85ccf1ed53a71b092fb5d807edf1ea7dabdd Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:28 -0700 Subject: [PATCH] page migration cleanup: group functions Reorder functions in migrate.c. Group all migration functions for struct address_space_operations together. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 142 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 72 insertions(+), 70 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 1c25040693d..49e71ddb679 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -119,15 +119,6 @@ int putback_lru_pages(struct list_head *l) return count; } -/* - * Non migratable page - */ -int fail_migrate_page(struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - /* * swapout a single page * page is locked upon entry, unlocked on exit @@ -297,6 +288,17 @@ void migrate_page_copy(struct page *newpage, struct page *page) } EXPORT_SYMBOL(migrate_page_copy); +/************************************************************ + * Migration functions + ***********************************************************/ + +/* Always fail migration. Used for mappings that are not movable */ +int fail_migrate_page(struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); + /* * Common logic to directly migrate a single page suitable for * pages that do not use PagePrivate. @@ -329,6 +331,67 @@ int migrate_page(struct page *newpage, struct page *page) } EXPORT_SYMBOL(migrate_page); +/* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. + */ +int buffer_migrate_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct buffer_head *bh, *head; + int rc; + + if (!mapping) + return -EAGAIN; + + if (!page_has_buffers(page)) + return migrate_page(newpage, page); + + head = page_buffers(page); + + rc = migrate_page_remove_references(newpage, page, 3); + + if (rc) + return rc; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); + /* * migrate_pages * @@ -528,67 +591,6 @@ next: return nr_failed + retry; } -/* - * Migration function for pages with buffers. This function can only be used - * if the underlying filesystem guarantees that no other references to "page" - * exist. - */ -int buffer_migrate_page(struct page *newpage, struct page *page) -{ - struct address_space *mapping = page->mapping; - struct buffer_head *bh, *head; - int rc; - - if (!mapping) - return -EAGAIN; - - if (!page_has_buffers(page)) - return migrate_page(newpage, page); - - head = page_buffers(page); - - rc = migrate_page_remove_references(newpage, page, 3); - - if (rc) - return rc; - - bh = head; - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); - - ClearPagePrivate(page); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - get_page(newpage); - - bh = head; - do { - set_bh_page(bh, newpage, bh_offset(bh)); - bh = bh->b_this_page; - - } while (bh != head); - - SetPagePrivate(newpage); - - migrate_page_copy(newpage, page); - - bh = head; - do { - unlock_buffer(bh); - put_bh(bh); - bh = bh->b_this_page; - - } while (bh != head); - - return 0; -} -EXPORT_SYMBOL(buffer_migrate_page); - /* * Migrate the list 'pagelist' of pages to a certain destination. * -- cgit v1.2.3-70-g09d2 From e7340f73307abed9283d0a07570d06e228c205dd Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:29 -0700 Subject: [PATCH] page migration cleanup: remove useless definitions Remove the export for migrate_page_remove_references() and migrate_page_copy() that are unlikely to be used directly by filesystems implementing migration. The export was useful when buffer_migrate_page() lived in fs/buffer.c but it has now been moved to migrate.c in the migration reorg. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 -- mm/migrate.c | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 6789c4940c9..e8d3b08cc35 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -7,8 +7,6 @@ extern int isolate_lru_page(struct page *p, struct list_head *pagelist); extern int putback_lru_pages(struct list_head *l); extern int migrate_page(struct page *, struct page *); -extern void migrate_page_copy(struct page *, struct page *); -extern int migrate_page_remove_references(struct page *, struct page *, int); extern int migrate_pages(struct list_head *l, struct list_head *t, struct list_head *moved, struct list_head *failed); extern int migrate_pages_to(struct list_head *pagelist, diff --git a/mm/migrate.c b/mm/migrate.c index 49e71ddb679..be3f141e53a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -169,7 +169,7 @@ retry: * Remove references for a page and establish the new page with the correct * basic settings to be able to stop accesses to the page. */ -int migrate_page_remove_references(struct page *newpage, +static int migrate_page_remove_references(struct page *newpage, struct page *page, int nr_refs) { struct address_space *mapping = page_mapping(page); @@ -246,12 +246,11 @@ int migrate_page_remove_references(struct page *newpage, return 0; } -EXPORT_SYMBOL(migrate_page_remove_references); /* * Copy the page to its new location */ -void migrate_page_copy(struct page *newpage, struct page *page) +static void migrate_page_copy(struct page *newpage, struct page *page) { copy_highpage(newpage, page); @@ -286,7 +285,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageWriteback(newpage)) end_page_writeback(newpage); } -EXPORT_SYMBOL(migrate_page_copy); /************************************************************ * Migration functions -- cgit v1.2.3-70-g09d2 From 5b5c7120e2154239837fad5e3c7b7b781092b19c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:29 -0700 Subject: [PATCH] page migration cleanup: drop nr_refs in remove_references() Drop nr_refs parameter from migrate_page_remove_references() The nr_refs parameter is not really useful since the number of remaining references is always 1 for anonymous pages without a mapping 2 for pages with a mapping 3 for pages with a mapping and PagePrivate set. Remove the early check for the number of references since we are checking page_mapcount() earlier. Ultimately only the refcount matters after the tree_lock has been obtained. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index be3f141e53a..2803a6698dd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -168,19 +168,19 @@ retry: /* * Remove references for a page and establish the new page with the correct * basic settings to be able to stop accesses to the page. + * + * The number of remaining references must be: + * 1 for anonymous pages without a mapping + * 2 for pages with a mapping + * 3 for pages with a mapping and PagePrivate set. */ static int migrate_page_remove_references(struct page *newpage, - struct page *page, int nr_refs) + struct page *page) { struct address_space *mapping = page_mapping(page); struct page **radix_pointer; - /* - * Avoid doing any of the following work if the page count - * indicates that the page is in use or truncate has removed - * the page. - */ - if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) + if (!mapping) return -EAGAIN; /* @@ -218,7 +218,8 @@ static int migrate_page_remove_references(struct page *newpage, &mapping->page_tree, page_index(page)); - if (!page_mapping(page) || page_count(page) != nr_refs || + if (!page_mapping(page) || + page_count(page) != 2 + !!PagePrivate(page) || *radix_pointer != page) { write_unlock_irq(&mapping->tree_lock); return -EAGAIN; @@ -309,7 +310,7 @@ int migrate_page(struct page *newpage, struct page *page) BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_remove_references(newpage, page, 2); + rc = migrate_page_remove_references(newpage, page); if (rc) return rc; @@ -348,7 +349,7 @@ int buffer_migrate_page(struct page *newpage, struct page *page) head = page_buffers(page); - rc = migrate_page_remove_references(newpage, page, 3); + rc = migrate_page_remove_references(newpage, page); if (rc) return rc; -- cgit v1.2.3-70-g09d2 From c3fcf8a5daacf350f0632e1379414c01f34eeea3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:32 -0700 Subject: [PATCH] page migration cleanup: extract try_to_unmap from migration functions Extract try_to_unmap and rename remove_references -> move_mapping try_to_unmap() may significantly change the page state by for example setting the dirty bit. It is therefore best to unmap in migrate_pages() before calling any migration functions. migrate_page_remove_references() will then only move the new page in place of the old page in the mapping. Rename the function to migrate_page_move_mapping(). This allows us to get rid of the special unmapping for the fallback path. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 76 +++++++++++++++++++++++++----------------------------------- 1 file changed, 31 insertions(+), 45 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 2803a6698dd..8095c607a49 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -166,15 +166,14 @@ retry: } /* - * Remove references for a page and establish the new page with the correct - * basic settings to be able to stop accesses to the page. + * Replace the page in the mapping. * * The number of remaining references must be: * 1 for anonymous pages without a mapping * 2 for pages with a mapping * 3 for pages with a mapping and PagePrivate set. */ -static int migrate_page_remove_references(struct page *newpage, +static int migrate_page_move_mapping(struct page *newpage, struct page *page) { struct address_space *mapping = page_mapping(page); @@ -183,35 +182,6 @@ static int migrate_page_remove_references(struct page *newpage, if (!mapping) return -EAGAIN; - /* - * Establish swap ptes for anonymous pages or destroy pte - * maps for files. - * - * In order to reestablish file backed mappings the fault handlers - * will take the radix tree_lock which may then be used to stop - * processses from accessing this page until the new page is ready. - * - * A process accessing via a swap pte (an anonymous page) will take a - * page_lock on the old page which will block the process until the - * migration attempt is complete. At that time the PageSwapCache bit - * will be examined. If the page was migrated then the PageSwapCache - * bit will be clear and the operation to retrieve the page will be - * retried which will find the new page in the radix tree. Then a new - * direct mapping may be generated based on the radix tree contents. - * - * If the page was not migrated then the PageSwapCache bit - * is still set and the operation may continue. - */ - if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> permanent failure */ - return -EPERM; - - /* - * Give up if we were unable to remove all mappings. - */ - if (page_mapcount(page)) - return -EAGAIN; - write_lock_irq(&mapping->tree_lock); radix_pointer = (struct page **)radix_tree_lookup_slot( @@ -310,7 +280,7 @@ int migrate_page(struct page *newpage, struct page *page) BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_remove_references(newpage, page); + rc = migrate_page_move_mapping(newpage, page); if (rc) return rc; @@ -349,7 +319,7 @@ int buffer_migrate_page(struct page *newpage, struct page *page) head = page_buffers(page); - rc = migrate_page_remove_references(newpage, page); + rc = migrate_page_move_mapping(newpage, page); if (rc) return rc; @@ -481,6 +451,33 @@ redo: newpage = lru_to_page(to); lock_page(newpage); + /* + * Establish swap ptes for anonymous pages or destroy pte + * maps for files. + * + * In order to reestablish file backed mappings the fault handlers + * will take the radix tree_lock which may then be used to stop + * processses from accessing this page until the new page is ready. + * + * A process accessing via a swap pte (an anonymous page) will take a + * page_lock on the old page which will block the process until the + * migration attempt is complete. At that time the PageSwapCache bit + * will be examined. If the page was migrated then the PageSwapCache + * bit will be clear and the operation to retrieve the page will be + * retried which will find the new page in the radix tree. Then a new + * direct mapping may be generated based on the radix tree contents. + * + * If the page was not migrated then the PageSwapCache bit + * is still set and the operation may continue. + */ + rc = -EPERM; + if (try_to_unmap(page, 1) == SWAP_FAIL) + /* A vma has VM_LOCKED set -> permanent failure */ + goto unlock_both; + + rc = -EAGAIN; + if (page_mapped(page)) + goto unlock_both; /* * Pages are properly locked and writeback is complete. * Try to migrate the page. @@ -501,17 +498,6 @@ redo: goto unlock_both; } - /* Make sure the dirty bit is up to date */ - if (try_to_unmap(page, 1) == SWAP_FAIL) { - rc = -EPERM; - goto unlock_both; - } - - if (page_mapcount(page)) { - rc = -EAGAIN; - goto unlock_both; - } - /* * Default handling if a filesystem does not provide * a migration function. We can only migrate clean -- cgit v1.2.3-70-g09d2 From 2d1db3b1170db4e8bf0531dd636742269c2cf579 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:33 -0700 Subject: [PATCH] page migration cleanup: pass "mapping" to migration functions Change handling of address spaces. Pass a pointer to the address space in which the page is migrated to all migration function. This avoids repeatedly having to retrieve the address space pointer from the page and checking it for validity. The old page mapping will change once migration has gone to a certain step, so it is less confusing to have the pointer always available. Move the setting of the mapping and index for the new page into migrate_pages(). Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 +++-- include/linux/migrate.h | 6 +++-- mm/migrate.c | 70 ++++++++++++++++++++++++------------------------- 3 files changed, 42 insertions(+), 40 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/fs.h b/include/linux/fs.h index c823a3815e2..e917403f4d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -376,7 +376,8 @@ struct address_space_operations { struct page* (*get_xip_page)(struct address_space *, sector_t, int); /* migrate the contents of a page to the specified target */ - int (*migratepage) (struct page *, struct page *); + int (*migratepage) (struct address_space *, + struct page *, struct page *); }; struct backing_dev_info; @@ -1772,7 +1773,8 @@ extern void simple_release_fs(struct vfsmount **mount, int *count); extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t); #ifdef CONFIG_MIGRATION -extern int buffer_migrate_page(struct page *, struct page *); +extern int buffer_migrate_page(struct address_space *, + struct page *, struct page *); #else #define buffer_migrate_page NULL #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index e8d3b08cc35..287c47b5e5d 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -6,12 +6,14 @@ #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p, struct list_head *pagelist); extern int putback_lru_pages(struct list_head *l); -extern int migrate_page(struct page *, struct page *); +extern int migrate_page(struct address_space *, + struct page *, struct page *); extern int migrate_pages(struct list_head *l, struct list_head *t, struct list_head *moved, struct list_head *failed); extern int migrate_pages_to(struct list_head *pagelist, struct vm_area_struct *vma, int dest); -extern int fail_migrate_page(struct page *, struct page *); +extern int fail_migrate_page(struct address_space *, + struct page *, struct page *); extern int migrate_prep(void); diff --git a/mm/migrate.c b/mm/migrate.c index 8095c607a49..f65e69d9452 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -173,15 +173,11 @@ retry: * 2 for pages with a mapping * 3 for pages with a mapping and PagePrivate set. */ -static int migrate_page_move_mapping(struct page *newpage, - struct page *page) +static int migrate_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page) { - struct address_space *mapping = page_mapping(page); struct page **radix_pointer; - if (!mapping) - return -EAGAIN; - write_lock_irq(&mapping->tree_lock); radix_pointer = (struct page **)radix_tree_lookup_slot( @@ -197,15 +193,8 @@ static int migrate_page_move_mapping(struct page *newpage, /* * Now we know that no one else is looking at the page. - * - * Certain minimal information about a page must be available - * in order for other subsystems to properly handle the page if they - * find it through the radix tree update before we are finished - * copying the page. */ get_page(newpage); - newpage->index = page->index; - newpage->mapping = page->mapping; if (PageSwapCache(page)) { SetPageSwapCache(newpage); set_page_private(newpage, page_private(page)); @@ -262,7 +251,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page) ***********************************************************/ /* Always fail migration. Used for mappings that are not movable */ -int fail_migrate_page(struct page *newpage, struct page *page) +int fail_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) { return -EIO; } @@ -274,13 +264,14 @@ EXPORT_SYMBOL(fail_migrate_page); * * Pages are locked upon entry and exit. */ -int migrate_page(struct page *newpage, struct page *page) +int migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) { int rc; BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page); if (rc) return rc; @@ -305,21 +296,18 @@ EXPORT_SYMBOL(migrate_page); * if the underlying filesystem guarantees that no other references to "page" * exist. */ -int buffer_migrate_page(struct page *newpage, struct page *page) +int buffer_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) { - struct address_space *mapping = page->mapping; struct buffer_head *bh, *head; int rc; - if (!mapping) - return -EAGAIN; - if (!page_has_buffers(page)) - return migrate_page(newpage, page); + return migrate_page(mapping, newpage, page); head = page_buffers(page); - rc = migrate_page_move_mapping(newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page); if (rc) return rc; @@ -448,9 +436,6 @@ redo: goto next; } - newpage = lru_to_page(to); - lock_page(newpage); - /* * Establish swap ptes for anonymous pages or destroy pte * maps for files. @@ -473,11 +458,18 @@ redo: rc = -EPERM; if (try_to_unmap(page, 1) == SWAP_FAIL) /* A vma has VM_LOCKED set -> permanent failure */ - goto unlock_both; + goto unlock_page; rc = -EAGAIN; if (page_mapped(page)) - goto unlock_both; + goto unlock_page; + + newpage = lru_to_page(to); + lock_page(newpage); + /* Prepare mapping for the new page.*/ + newpage->index = page->index; + newpage->mapping = page->mapping; + /* * Pages are properly locked and writeback is complete. * Try to migrate the page. @@ -494,7 +486,8 @@ redo: * own migration function. This is the most common * path for page migration. */ - rc = mapping->a_ops->migratepage(newpage, page); + rc = mapping->a_ops->migratepage(mapping, + newpage, page); goto unlock_both; } @@ -524,7 +517,7 @@ redo: */ if (!page_has_buffers(page) || try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(newpage, page); + rc = migrate_page(mapping, newpage, page); goto unlock_both; } @@ -553,12 +546,17 @@ unlock_page: unlock_page(page); next: - if (rc == -EAGAIN) { - retry++; - } else if (rc) { - /* Permanent failure */ - list_move(&page->lru, failed); - nr_failed++; + if (rc) { + if (newpage) + newpage->mapping = NULL; + + if (rc == -EAGAIN) + retry++; + else { + /* Permanent failure */ + list_move(&page->lru, failed); + nr_failed++; + } } else { if (newpage) { /* Successful migration. Return page to LRU */ -- cgit v1.2.3-70-g09d2 From 8351a6e4785218a2b03c142be92926baff95ba5c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:33 -0700 Subject: [PATCH] page migration cleanup: move fallback handling into special function Move the fallback code into a new fallback function and make the function behave like any other migration function. This requires retaking the lock if pageout() drops it. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 90 ++++++++++++++++++++++++++---------------------------------- 1 file changed, 39 insertions(+), 51 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index f65e69d9452..5a340f4ca21 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -349,6 +349,42 @@ int buffer_migrate_page(struct address_space *mapping, } EXPORT_SYMBOL(buffer_migrate_page); +static int fallback_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + /* + * Default handling if a filesystem does not provide + * a migration function. We can only migrate clean + * pages so try to write out any dirty pages first. + */ + if (PageDirty(page)) { + switch (pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + return -EAGAIN; + + case PAGE_SUCCESS: + /* Relock since we lost the lock */ + lock_page(page); + /* Must retry since page state may have changed */ + return -EAGAIN; + + case PAGE_CLEAN: + ; /* try to migrate the page below */ + } + } + + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (page_has_buffers(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; + + return migrate_page(mapping, newpage, page); +} + /* * migrate_pages * @@ -478,7 +514,7 @@ redo: if (!mapping) goto unlock_both; - if (mapping->a_ops->migratepage) { + if (mapping->a_ops->migratepage) /* * Most pages have a mapping and most filesystems * should provide a migration function. Anonymous @@ -488,56 +524,8 @@ redo: */ rc = mapping->a_ops->migratepage(mapping, newpage, page); - goto unlock_both; - } - - /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. - */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_both; - - case PAGE_SUCCESS: - unlock_page(newpage); - goto next; - - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } - - /* - * Buffers are managed in a filesystem specific way. - * We must have no buffers or drop them. - */ - if (!page_has_buffers(page) || - try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(mapping, newpage, page); - goto unlock_both; - } - - /* - * On early passes with mapped pages simply - * retry. There may be a lock held for some - * buffers that may go away. Later - * swap them out. - */ - if (pass > 4) { - /* - * Persistently unable to drop buffers..... As a - * measure of last resort we fall back to - * swap_page(). - */ - unlock_page(newpage); - newpage = NULL; - rc = swap_page(page); - goto next; - } + else + rc = fallback_migrate_page(mapping, newpage, page); unlock_both: unlock_page(newpage); -- cgit v1.2.3-70-g09d2 From 0697212a411c1dae03c27845f2de2f3adb32c331 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:35 -0700 Subject: [PATCH] Swapless page migration: add R/W migration entries Implement read/write migration ptes We take the upper two swapfiles for the two types of migration ptes and define a series of macros in swapops.h. The VM is modified to handle the migration entries. migration entries can only be encountered when the page they are pointing to is locked. This limits the number of places one has to fix. We also check in copy_pte_range and in mprotect_pte_range() for migration ptes. We check for migration ptes in do_swap_cache and call a function that will then wait on the page lock. This allows us to effectively stop all accesses to apge. Migration entries are created by try_to_unmap if called for migration and removed by local functions in migrate.c From: Hugh Dickins Several times while testing swapless page migration (I've no NUMA, just hacking it up to migrate recklessly while running load), I've hit the BUG_ON(!PageLocked(p)) in migration_entry_to_page. This comes from an orphaned migration entry, unrelated to the current correctly locked migration, but hit by remove_anon_migration_ptes as it checks an address in each vma of the anon_vma list. Such an orphan may be left behind if an earlier migration raced with fork: copy_one_pte can duplicate a migration entry from parent to child, after remove_anon_migration_ptes has checked the child vma, but before it has removed it from the parent vma. (If the process were later to fault on this orphaned entry, it would hit the same BUG from migration_entry_wait.) This could be fixed by locking anon_vma in copy_one_pte, but we'd rather not. There's no such problem with file pages, because vma_prio_tree_add adds child vma after parent vma, and the page table locking at each end is enough to serialize. Follow that example with anon_vma: add new vmas to the tail instead of the head. (There's no corresponding problem when inserting migration entries, because a missed pte will leave the page count and mapcount high, which is allowed for. And there's no corresponding problem when migrating via swap, because a leftover swap entry will be correctly faulted. But the swapless method has no refcounting of its entries.) From: Ingo Molnar pte_unmap_unlock() takes the pte pointer as an argument. From: Hugh Dickins Several times while testing swapless page migration, gcc has tried to exec a pointer instead of a string: smells like COW mappings are not being properly write-protected on fork. The protection in copy_one_pte looks very convincing, until at last you realize that the second arg to make_migration_entry is a boolean "write", and SWP_MIGRATION_READ is 30. Anyway, it's better done like in change_pte_range, using is_write_migration_entry and make_migration_entry_read. From: Hugh Dickins Remove unnecessary obfuscation from sys_swapon's range check on swap type, which blew up causing memory corruption once swapless migration made MAX_SWAPFILES no longer 2 ^ MAX_SWAPFILES_SHIFT. Signed-off-by: Hugh Dickins Acked-by: Martin Schwidefsky Signed-off-by: Hugh Dickins Signed-off-by: Christoph Lameter Signed-off-by: Ingo Molnar From: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 7 +++ include/linux/swapops.h | 53 ++++++++++++++++++++ mm/memory.c | 18 ++++++- mm/migrate.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++- mm/mprotect.c | 23 +++++++-- mm/rmap.c | 38 ++++++++------ mm/swapfile.c | 20 +++----- 7 files changed, 255 insertions(+), 32 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index cd28ad206da..7cee73ef4f1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -28,7 +28,14 @@ static inline int current_is_kswapd(void) * the type/offset into the pte as 5/27 as well. */ #define MAX_SWAPFILES_SHIFT 5 +#ifndef CONFIG_MIGRATION #define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) +#else +/* Use last two entries for page migration swap entries */ +#define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2) +#define SWP_MIGRATION_READ MAX_SWAPFILES +#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1) +#endif /* * Magic header for a swap area. The first part of the union is diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 87b9d14c710..ec639aa3a1d 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -67,3 +67,56 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry) BUG_ON(pte_file(__swp_entry_to_pte(arch_entry))); return __swp_entry_to_pte(arch_entry); } + +#ifdef CONFIG_MIGRATION +static inline swp_entry_t make_migration_entry(struct page *page, int write) +{ + BUG_ON(!PageLocked(page)); + return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ, + page_to_pfn(page)); +} + +static inline int is_migration_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_MIGRATION_READ || + swp_type(entry) == SWP_MIGRATION_WRITE); +} + +static inline int is_write_migration_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); +} + +static inline struct page *migration_entry_to_page(swp_entry_t entry) +{ + struct page *p = pfn_to_page(swp_offset(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + BUG_ON(!PageLocked(p)); + return p; +} + +static inline void make_migration_entry_read(swp_entry_t *entry) +{ + *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); +} + +extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address); +#else + +#define make_migration_entry(page, write) swp_entry(0, 0) +#define is_migration_entry(swp) 0 +#define migration_entry_to_page(swp) NULL +static inline void make_migration_entry_read(swp_entry_t *entryp) { } +static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) { } +static inline int is_write_migration_entry(swp_entry_t entry) +{ + return 0; +} + +#endif + diff --git a/mm/memory.c b/mm/memory.c index 7e3683fd4f3..11673c5d2c2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { if (!pte_file(pte)) { - swap_duplicate(pte_to_swp_entry(pte)); + swp_entry_t entry = pte_to_swp_entry(pte); + + swap_duplicate(entry); /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); @@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both parent + * and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } } goto out_set_pte; } @@ -1879,6 +1891,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; entry = pte_to_swp_entry(orig_pte); + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + goto out; + } page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); diff --git a/mm/migrate.c b/mm/migrate.c index 5a340f4ca21..0a011e421bb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,7 +24,6 @@ #include #include #include -#include #include "internal.h" @@ -119,6 +119,132 @@ int putback_lru_pages(struct list_head *l) return count; } +static inline int is_swap_pte(pte_t pte) +{ + return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); +} + +/* + * Restore a potential migration pte to a working pte entry + */ +static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, + struct page *old, struct page *new) +{ + struct mm_struct *mm = vma->vm_mm; + swp_entry_t entry; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return; + + ptep = pte_offset_map(pmd, addr); + + if (!is_swap_pte(*ptep)) { + pte_unmap(ptep); + return; + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + + if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) + goto out; + + inc_mm_counter(mm, anon_rss); + get_page(new); + pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + if (is_write_migration_entry(entry)) + pte = pte_mkwrite(pte); + set_pte_at(mm, addr, ptep, pte); + page_add_anon_rmap(new, vma, addr); +out: + pte_unmap_unlock(ptep, ptl); +} + +/* + * Get rid of all migration entries and replace them by + * references to the indicated page. + * + * Must hold mmap_sem lock on at least one of the vmas containing + * the page so that the anon_vma cannot vanish. + */ +static void remove_migration_ptes(struct page *old, struct page *new) +{ + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + unsigned long mapping; + + mapping = (unsigned long)new->mapping; + + if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) + return; + + /* + * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. + */ + anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); + spin_lock(&anon_vma->lock); + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) + remove_migration_pte(vma, page_address_in_vma(new, vma), + old, new); + + spin_unlock(&anon_vma->lock); +} + +/* + * Something used the pte of a page under migration. We need to + * get to the page and wait until migration is finished. + * When we return from this function the fault will be retried. + * + * This function is called from do_swap_page(). + */ +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + pte_t *ptep, pte; + spinlock_t *ptl; + swp_entry_t entry; + struct page *page; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto out; + + page = migration_entry_to_page(entry); + + get_page(page); + pte_unmap_unlock(ptep, ptl); + wait_on_page_locked(page); + put_page(page); + return; +out: + pte_unmap_unlock(ptep, ptl); +} + /* * swapout a single page * page is locked upon entry, unlocked on exit diff --git a/mm/mprotect.c b/mm/mprotect.c index 5faf01ad3ef..14f93e62270 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -19,7 +19,8 @@ #include #include #include - +#include +#include #include #include #include @@ -28,12 +29,13 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot) { - pte_t *pte; + pte_t *pte, oldpte; spinlock_t *ptl; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); do { - if (pte_present(*pte)) { + oldpte = *pte; + if (pte_present(oldpte)) { pte_t ptent; /* Avoid an SMP race with hardware updated dirty/clean @@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); set_pte_at(mm, addr, pte, ptent); lazy_mmu_prot_update(ptent); +#ifdef CONFIG_MIGRATION + } else if (!pte_file(oldpte)) { + swp_entry_t entry = pte_to_swp_entry(oldpte); + + if (is_write_migration_entry(entry)) { + /* + * A protection check is difficult so + * just be safe and disable write + */ + make_migration_entry_read(&entry); + set_pte_at(mm, addr, pte, + swp_entry_to_pte(entry)); + } +#endif } + } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(pte - 1, ptl); } diff --git a/mm/rmap.c b/mm/rmap.c index 10806b7af40..3b8ce86daa3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; - list_add(&vma->anon_vma_node, &anon_vma->head); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); allocated = NULL; } spin_unlock(&mm->page_table_lock); @@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma) struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) { - list_add(&vma->anon_vma_node, &anon_vma->head); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); validate_anon_vma(vma); } } @@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma) if (anon_vma) { spin_lock(&anon_vma->lock); - list_add(&vma->anon_vma_node, &anon_vma->head); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); validate_anon_vma(vma); spin_unlock(&anon_vma->lock); } @@ -620,17 +620,27 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - BUG_ON(!PageSwapCache(page)); - swap_duplicate(entry); - if (list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - if (list_empty(&mm->mmlist)) - list_add(&mm->mmlist, &init_mm.mmlist); - spin_unlock(&mmlist_lock); + + if (PageSwapCache(page)) { + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + swap_duplicate(entry); + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); + } + } else { + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + BUG_ON(!migration); + entry = make_migration_entry(page, pte_write(pteval)); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); diff --git a/mm/swapfile.c b/mm/swapfile.c index 47a6812f5f8..e3b1362372c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -395,6 +395,9 @@ void free_swap_and_cache(swp_entry_t entry) struct swap_info_struct * p; struct page *page = NULL; + if (is_migration_entry(entry)) + return; + p = swap_info_get(entry); if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { @@ -1400,19 +1403,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) if (!(p->flags & SWP_USED)) break; error = -EPERM; - /* - * Test if adding another swap device is possible. There are - * two limiting factors: 1) the number of bits for the swap - * type swp_entry_t definition and 2) the number of bits for - * the swap type in the swap ptes as defined by the different - * architectures. To honor both limitations a swap entry - * with swap offset 0 and swap type ~0UL is created, encoded - * to a swap pte, decoded to a swp_entry_t again and finally - * the swap type part is extracted. This will mask all bits - * from the initial ~0UL that can't be encoded in either the - * swp_entry_t or the architecture definition of a swap pte. - */ - if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { + if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); goto out; } @@ -1702,6 +1693,9 @@ int swap_duplicate(swp_entry_t entry) unsigned long offset, type; int result = 0; + if (is_migration_entry(entry)) + return 1; + type = swp_type(entry); if (type >= nr_swapfiles) goto bad_file; -- cgit v1.2.3-70-g09d2 From d75a0fcda2cfc71b50e16dc89e0c32c57d427e85 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:36 -0700 Subject: [PATCH] Swapless page migration: rip out swap based logic Rip the page migration logic out. Remove all code that has to do with swapping during page migration. This also guts the ability to migrate pages to swap. No one used that so lets let it go for good. Page migration should be a bit broken after this patch. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 - mm/migrate.c | 75 +++------------------------------------------------- mm/rmap.c | 38 -------------------------- mm/swapfile.c | 9 ------- 4 files changed, 3 insertions(+), 120 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 2d4c81a220d..bf97b090001 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -91,7 +91,6 @@ static inline void page_dup_rmap(struct page *page) */ int page_referenced(struct page *, int is_locked); int try_to_unmap(struct page *, int ignore_refs); -void remove_from_swap(struct page *page); /* * Called from mm/filemap_xip.c to unmap empty zero page diff --git a/mm/migrate.c b/mm/migrate.c index 0a011e421bb..81721a061d5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -70,10 +70,6 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) */ int migrate_prep(void) { - /* Must have swap device for migration */ - if (nr_swap_pages <= 0) - return -ENODEV; - /* * Clear the LRU lists so pages can be isolated. * Note that pages may be moved off the LRU after we have @@ -245,52 +241,6 @@ out: pte_unmap_unlock(ptep, ptl); } -/* - * swapout a single page - * page is locked upon entry, unlocked on exit - */ -static int swap_page(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - - if (page_mapped(page) && mapping) - if (try_to_unmap(page, 1) != SWAP_SUCCESS) - goto unlock_retry; - - if (PageDirty(page)) { - /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_retry; - - case PAGE_SUCCESS: - goto retry; - - case PAGE_CLEAN: - ; /* try to free the page below */ - } - } - - if (PagePrivate(page)) { - if (!try_to_release_page(page, GFP_KERNEL) || - (!mapping && page_count(page) == 1)) - goto unlock_retry; - } - - if (remove_mapping(mapping, page)) { - /* Success */ - unlock_page(page); - return 0; - } - -unlock_retry: - unlock_page(page); - -retry: - return -EAGAIN; -} - /* * Replace the page in the mapping. * @@ -517,8 +467,7 @@ static int fallback_migrate_page(struct address_space *mapping, * Two lists are passed to this function. The first list * contains the pages isolated from the LRU to be migrated. * The second list contains new pages that the pages isolated - * can be moved to. If the second list is NULL then all - * pages are swapped out. + * can be moved to. * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty @@ -574,29 +523,11 @@ redo: * Only wait on writeback if we have already done a pass where * we we may have triggered writeouts for lots of pages. */ - if (pass > 0) { + if (pass > 0) wait_on_page_writeback(page); - } else { + else if (PageWriteback(page)) goto unlock_page; - } - - /* - * Anonymous pages must have swap cache references otherwise - * the information contained in the page maps cannot be - * preserved. - */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page, GFP_KERNEL)) { - rc = -ENOMEM; - goto unlock_page; - } - } - - if (!to) { - rc = swap_page(page); - goto next; - } /* * Establish swap ptes for anonymous pages or destroy pte diff --git a/mm/rmap.c b/mm/rmap.c index 3b8ce86daa3..a53a10b93ec 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -205,44 +205,6 @@ out: return anon_vma; } -#ifdef CONFIG_MIGRATION -/* - * Remove an anonymous page from swap replacing the swap pte's - * through real pte's pointing to valid pages and then releasing - * the page from the swap cache. - * - * Must hold page lock on page and mmap_sem of one vma that contains - * the page. - */ -void remove_from_swap(struct page *page) -{ - struct anon_vma *anon_vma; - struct vm_area_struct *vma; - unsigned long mapping; - - if (!PageSwapCache(page)) - return; - - mapping = (unsigned long)page->mapping; - - if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) - return; - - /* - * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. - */ - anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); - - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) - remove_vma_swap(vma, page); - - spin_unlock(&anon_vma->lock); - delete_from_swap_cache(page); -} -EXPORT_SYMBOL(remove_from_swap); -#endif - /* * At what user virtual address is page expected in vma? */ diff --git a/mm/swapfile.c b/mm/swapfile.c index e3b1362372c..fbceed67a07 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -618,15 +618,6 @@ static int unuse_mm(struct mm_struct *mm, return 0; } -#ifdef CONFIG_MIGRATION -int remove_vma_swap(struct vm_area_struct *vma, struct page *page) -{ - swp_entry_t entry = { .val = page_private(page) }; - - return unuse_vma(vma, entry, page); -} -#endif - /* * Scan swap_map from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. -- cgit v1.2.3-70-g09d2 From 6c5240ae7f48c83fcaa8e24fa63e7eb09aba5651 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:37 -0700 Subject: [PATCH] Swapless page migration: modify core logic Use the migration entries for page migration This modifies the migration code to use the new migration entries. It now becomes possible to migrate anonymous pages without having to add a swap entry. We add a couple of new functions to replace migration entries with the proper ptes. We cannot take the tree_lock for migrating anonymous pages anymore. However, we know that we hold the only remaining reference to the page when the page count reaches 1. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 4 ++-- mm/migrate.c | 53 +++++++++++++++++++++-------------------------------- 2 files changed, 23 insertions(+), 34 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/Kconfig b/mm/Kconfig index 332f5c29b53..66e65ab3942 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS # config MIGRATION bool "Page migration" - def_bool y if NUMA - depends on SWAP && NUMA + def_bool y + depends on NUMA help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful for diff --git a/mm/migrate.c b/mm/migrate.c index 81721a061d5..8f91463eab4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -254,14 +254,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, { struct page **radix_pointer; + if (!mapping) { + /* Anonymous page */ + if (page_count(page) != 1) + return -EAGAIN; + return 0; + } + write_lock_irq(&mapping->tree_lock); radix_pointer = (struct page **)radix_tree_lookup_slot( &mapping->page_tree, page_index(page)); - if (!page_mapping(page) || - page_count(page) != 2 + !!PagePrivate(page) || + if (page_count(page) != 2 + !!PagePrivate(page) || *radix_pointer != page) { write_unlock_irq(&mapping->tree_lock); return -EAGAIN; @@ -271,10 +277,12 @@ static int migrate_page_move_mapping(struct address_space *mapping, * Now we know that no one else is looking at the page. */ get_page(newpage); +#ifdef CONFIG_SWAP if (PageSwapCache(page)) { SetPageSwapCache(newpage); set_page_private(newpage, page_private(page)); } +#endif *radix_pointer = newpage; __put_page(page); @@ -308,7 +316,9 @@ static void migrate_page_copy(struct page *newpage, struct page *page) set_page_dirty(newpage); } +#ifdef CONFIG_SWAP ClearPageSwapCache(page); +#endif ClearPageActive(page); ClearPagePrivate(page); set_page_private(page, 0); @@ -353,16 +363,6 @@ int migrate_page(struct address_space *mapping, return rc; migrate_page_copy(newpage, page); - - /* - * Remove auxiliary swap entries and replace - * them with real ptes. - * - * Note that a real pte entry will allow processes that are not - * waiting on the page lock to use the new page via the page tables - * before the new page is unlocked. - */ - remove_from_swap(newpage); return 0; } EXPORT_SYMBOL(migrate_page); @@ -530,23 +530,7 @@ redo: goto unlock_page; /* - * Establish swap ptes for anonymous pages or destroy pte - * maps for files. - * - * In order to reestablish file backed mappings the fault handlers - * will take the radix tree_lock which may then be used to stop - * processses from accessing this page until the new page is ready. - * - * A process accessing via a swap pte (an anonymous page) will take a - * page_lock on the old page which will block the process until the - * migration attempt is complete. At that time the PageSwapCache bit - * will be examined. If the page was migrated then the PageSwapCache - * bit will be clear and the operation to retrieve the page will be - * retried which will find the new page in the radix tree. Then a new - * direct mapping may be generated based on the radix tree contents. - * - * If the page was not migrated then the PageSwapCache bit - * is still set and the operation may continue. + * Establish migration ptes or remove ptes */ rc = -EPERM; if (try_to_unmap(page, 1) == SWAP_FAIL) @@ -569,9 +553,9 @@ redo: */ mapping = page_mapping(page); if (!mapping) - goto unlock_both; + rc = migrate_page(mapping, newpage, page); - if (mapping->a_ops->migratepage) + else if (mapping->a_ops->migratepage) /* * Most pages have a mapping and most filesystems * should provide a migration function. Anonymous @@ -584,10 +568,15 @@ redo: else rc = fallback_migrate_page(mapping, newpage, page); -unlock_both: + if (!rc) + remove_migration_ptes(page, newpage); + unlock_page(newpage); unlock_page: + if (rc) + remove_migration_ptes(page, page); + unlock_page(page); next: -- cgit v1.2.3-70-g09d2 From 442c9137de8d769053e81d325709dca72f0b5e44 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:38 -0700 Subject: [PATCH] More page migration: do not inc/dec rss counters If we install a migration entry then the rss not really decreases since the page is just moved somewhere else. We can save ourselves the work of decrementing and later incrementing which will just eventually cause cacheline bouncing. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 1 - mm/rmap.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 8f91463eab4..96b9546e69e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -164,7 +164,6 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) goto out; - inc_mm_counter(mm, anon_rss); get_page(new); pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); if (is_write_migration_entry(entry)) diff --git a/mm/rmap.c b/mm/rmap.c index a53a10b93ec..05d6d73a692 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -595,6 +595,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } + dec_mm_counter(mm, anon_rss); } else { /* * Store the pfn of the page in a special migration @@ -606,7 +607,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); - dec_mm_counter(mm, anon_rss); } else dec_mm_counter(mm, file_rss); -- cgit v1.2.3-70-g09d2 From 04e62a29bf157ce1edd168f2b71b533c80d13628 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:38 -0700 Subject: [PATCH] More page migration: use migration entries for file pages This implements the use of migration entries to preserve ptes of file backed pages during migration. Processes can therefore be migrated back and forth without loosing their connection to pagecache pages. Note that we implement the migration entries only for linear mappings. Nonlinear mappings still require the unmapping of the ptes for migration. And another writepage() ugliness shows up. writepage() can drop the page lock. Therefore we have to remove migration ptes before calling writepages() in order to avoid having migration entries point to unlocked pages. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 15 ------ mm/migrate.c | 127 ++++++++++++++++++++++++++++++++++++++++----------- mm/rmap.c | 11 +++++ mm/vmscan.c | 14 +++++- 4 files changed, 124 insertions(+), 43 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index 7cee73ef4f1..1cf234e8df5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -186,20 +186,6 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); -/* possible outcome of pageout() */ -typedef enum { - /* failed to write page out, page is locked */ - PAGE_KEEP, - /* move page to the active list, page is locked */ - PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ - PAGE_SUCCESS, - /* page is clean and locked */ - PAGE_CLEAN, -} pageout_t; - -extern pageout_t pageout(struct page *page, struct address_space *mapping); - #ifdef CONFIG_NUMA extern int zone_reclaim_mode; extern int zone_reclaim_interval; @@ -259,7 +245,6 @@ extern int remove_exclusive_swap_page(struct page *); struct backing_dev_info; extern spinlock_t swap_lock; -extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page); /* linux/mm/thrash.c */ extern struct mm_struct * swap_token_mm; diff --git a/mm/migrate.c b/mm/migrate.c index 96b9546e69e..b5000d46389 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "internal.h" @@ -123,7 +124,7 @@ static inline int is_swap_pte(pte_t pte) /* * Restore a potential migration pte to a working pte entry */ -static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, +static void remove_migration_pte(struct vm_area_struct *vma, struct page *old, struct page *new) { struct mm_struct *mm = vma->vm_mm; @@ -133,6 +134,10 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; + unsigned long addr = page_address_in_vma(new, vma); + + if (addr == -EFAULT) + return; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) @@ -169,19 +174,47 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); set_pte_at(mm, addr, ptep, pte); - page_add_anon_rmap(new, vma, addr); + + if (PageAnon(new)) + page_add_anon_rmap(new, vma, addr); + else + page_add_file_rmap(new); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, pte); + lazy_mmu_prot_update(pte); + out: pte_unmap_unlock(ptep, ptl); } /* - * Get rid of all migration entries and replace them by - * references to the indicated page. - * + * Note that remove_file_migration_ptes will only work on regular mappings, + * Nonlinear mappings do not use migration entries. + */ +static void remove_file_migration_ptes(struct page *old, struct page *new) +{ + struct vm_area_struct *vma; + struct address_space *mapping = page_mapping(new); + struct prio_tree_iter iter; + pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + if (!mapping) + return; + + spin_lock(&mapping->i_mmap_lock); + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) + remove_migration_pte(vma, old, new); + + spin_unlock(&mapping->i_mmap_lock); +} + +/* * Must hold mmap_sem lock on at least one of the vmas containing * the page so that the anon_vma cannot vanish. */ -static void remove_migration_ptes(struct page *old, struct page *new) +static void remove_anon_migration_ptes(struct page *old, struct page *new) { struct anon_vma *anon_vma; struct vm_area_struct *vma; @@ -199,12 +232,23 @@ static void remove_migration_ptes(struct page *old, struct page *new) spin_lock(&anon_vma->lock); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) - remove_migration_pte(vma, page_address_in_vma(new, vma), - old, new); + remove_migration_pte(vma, old, new); spin_unlock(&anon_vma->lock); } +/* + * Get rid of all migration entries and replace them by + * references to the indicated page. + */ +static void remove_migration_ptes(struct page *old, struct page *new) +{ + if (PageAnon(new)) + remove_anon_migration_ptes(old, new); + else + remove_file_migration_ptes(old, new); +} + /* * Something used the pte of a page under migration. We need to * get to the page and wait until migration is finished. @@ -424,30 +468,59 @@ int buffer_migrate_page(struct address_space *mapping, } EXPORT_SYMBOL(buffer_migrate_page); -static int fallback_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) +/* + * Writeback a page to clean the dirty state + */ +static int writeout(struct address_space *mapping, struct page *page) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1, + .range_start = 0, + .range_end = LLONG_MAX, + .nonblocking = 1, + .for_reclaim = 1 + }; + int rc; + + if (!mapping->a_ops->writepage) + /* No write method for the address space */ + return -EINVAL; + + if (!clear_page_dirty_for_io(page)) + /* Someone else already triggered a write */ + return -EAGAIN; + /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. + * A dirty page may imply that the underlying filesystem has + * the page on some queue. So the page must be clean for + * migration. Writeout may mean we loose the lock and the + * page state is no longer what we checked for earlier. + * At this point we know that the migration attempt cannot + * be successful. */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - return -EAGAIN; + remove_migration_ptes(page, page); - case PAGE_SUCCESS: - /* Relock since we lost the lock */ - lock_page(page); - /* Must retry since page state may have changed */ - return -EAGAIN; + rc = mapping->a_ops->writepage(page, &wbc); + if (rc < 0) + /* I/O Error writing */ + return -EIO; - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } + if (rc != AOP_WRITEPAGE_ACTIVATE) + /* unlocked. Relock */ + lock_page(page); + + return -EAGAIN; +} + +/* + * Default handling if a filesystem does not provide a migration function. + */ +static int fallback_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + if (PageDirty(page)) + return writeout(mapping, page); /* * Buffers may be managed in a filesystem specific way. diff --git a/mm/rmap.c b/mm/rmap.c index 05d6d73a692..882a85826bb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -596,6 +596,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spin_unlock(&mmlist_lock); } dec_mm_counter(mm, anon_rss); +#ifdef CONFIG_MIGRATION } else { /* * Store the pfn of the page in a special migration @@ -604,12 +605,22 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ BUG_ON(!migration); entry = make_migration_entry(page, pte_write(pteval)); +#endif } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); } else +#ifdef CONFIG_MIGRATION + if (migration) { + /* Establish migration entry for a file page */ + swp_entry_t entry; + entry = make_migration_entry(page, pte_write(pteval)); + set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + } else +#endif dec_mm_counter(mm, file_rss); + page_remove_rmap(page); page_cache_release(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index bc5d4f43036..71a02e29503 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -290,11 +290,23 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + /* * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ -pageout_t pageout(struct page *page, struct address_space *mapping) +static pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write -- cgit v1.2.3-70-g09d2 From e24f0b8f76cc3dd96f36f5b6a9f020f6c3fce198 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:51 -0700 Subject: [PATCH] page migration: simplify migrate_pages() Currently migrate_pages() is mess with lots of goto. Extract two functions from migrate_pages() and get rid of the gotos. Plus we can just unconditionally set the locked bit on the new page since we are the only one holding a reference. Locking is to stop others from accessing the page once we establish references to the new page. Remove the list_del from move_to_lru in order to have finer control over list processing. [akpm@osdl.org: add debug check] Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 218 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 115 insertions(+), 103 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index b5000d46389..09038163bfe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -84,7 +84,6 @@ int migrate_prep(void) static inline void move_to_lru(struct page *page) { - list_del(&page->lru); if (PageActive(page)) { /* * lru_cache_add_active checks that @@ -110,6 +109,7 @@ int putback_lru_pages(struct list_head *l) int count = 0; list_for_each_entry_safe(page, page2, l, lru) { + list_del(&page->lru); move_to_lru(page); count++; } @@ -533,12 +533,109 @@ static int fallback_migrate_page(struct address_space *mapping, return migrate_page(mapping, newpage, page); } +/* + * Move a page to a newly allocated page + * The page is locked and all ptes have been successfully removed. + * + * The new page will have replaced the old page if this function + * is successful. + */ +static int move_to_new_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping; + int rc; + + /* + * Block others from accessing the page when we get around to + * establishing additional references. We are the only one + * holding a reference to the new page at this point. + */ + if (TestSetPageLocked(newpage)) + BUG(); + + /* Prepare mapping for the new page.*/ + newpage->index = page->index; + newpage->mapping = page->mapping; + + mapping = page_mapping(page); + if (!mapping) + rc = migrate_page(mapping, newpage, page); + else if (mapping->a_ops->migratepage) + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(mapping, + newpage, page); + else + rc = fallback_migrate_page(mapping, newpage, page); + + if (!rc) + remove_migration_ptes(page, newpage); + else + newpage->mapping = NULL; + + unlock_page(newpage); + + return rc; +} + +/* + * Obtain the lock on page, remove all ptes and migrate the page + * to the newly allocated page in newpage. + */ +static int unmap_and_move(struct page *newpage, struct page *page, int force) +{ + int rc = 0; + + if (page_count(page) == 1) + /* page was freed from under us. So we are done. */ + goto ret; + + rc = -EAGAIN; + if (TestSetPageLocked(page)) { + if (!force) + goto ret; + lock_page(page); + } + + if (PageWriteback(page)) { + if (!force) + goto unlock; + wait_on_page_writeback(page); + } + + /* + * Establish migration ptes or remove ptes + */ + if (try_to_unmap(page, 1) != SWAP_FAIL) { + if (!page_mapped(page)) + rc = move_to_new_page(newpage, page); + } else + /* A vma has VM_LOCKED set -> permanent failure */ + rc = -EPERM; + + if (rc) + remove_migration_ptes(page, page); +unlock: + unlock_page(page); +ret: + if (rc != -EAGAIN) { + list_del(&newpage->lru); + move_to_lru(newpage); + } + return rc; +} + /* * migrate_pages * * Two lists are passed to this function. The first list * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the pages isolated + * The second list contains new pages that the isolated pages * can be moved to. * * The function returns after 10 attempts or if no pages @@ -550,7 +647,7 @@ static int fallback_migrate_page(struct address_space *mapping, int migrate_pages(struct list_head *from, struct list_head *to, struct list_head *moved, struct list_head *failed) { - int retry; + int retry = 1; int nr_failed = 0; int pass = 0; struct page *page; @@ -561,118 +658,33 @@ int migrate_pages(struct list_head *from, struct list_head *to, if (!swapwrite) current->flags |= PF_SWAPWRITE; -redo: - retry = 0; - - list_for_each_entry_safe(page, page2, from, lru) { - struct page *newpage = NULL; - struct address_space *mapping; - - cond_resched(); - - rc = 0; - if (page_count(page) == 1) - /* page was freed from under us. So we are done. */ - goto next; - - if (to && list_empty(to)) - break; - - /* - * Skip locked pages during the first two passes to give the - * functions holding the lock time to release the page. Later we - * use lock_page() to have a higher chance of acquiring the - * lock. - */ - rc = -EAGAIN; - if (pass > 2) - lock_page(page); - else - if (TestSetPageLocked(page)) - goto next; - - /* - * Only wait on writeback if we have already done a pass where - * we we may have triggered writeouts for lots of pages. - */ - if (pass > 0) - wait_on_page_writeback(page); - else - if (PageWriteback(page)) - goto unlock_page; - - /* - * Establish migration ptes or remove ptes - */ - rc = -EPERM; - if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> permanent failure */ - goto unlock_page; + for(pass = 0; pass < 10 && retry; pass++) { + retry = 0; - rc = -EAGAIN; - if (page_mapped(page)) - goto unlock_page; + list_for_each_entry_safe(page, page2, from, lru) { - newpage = lru_to_page(to); - lock_page(newpage); - /* Prepare mapping for the new page.*/ - newpage->index = page->index; - newpage->mapping = page->mapping; + if (list_empty(to)) + break; - /* - * Pages are properly locked and writeback is complete. - * Try to migrate the page. - */ - mapping = page_mapping(page); - if (!mapping) - rc = migrate_page(mapping, newpage, page); + cond_resched(); - else if (mapping->a_ops->migratepage) - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(mapping, - newpage, page); - else - rc = fallback_migrate_page(mapping, newpage, page); - - if (!rc) - remove_migration_ptes(page, newpage); - - unlock_page(newpage); - -unlock_page: - if (rc) - remove_migration_ptes(page, page); + rc = unmap_and_move(lru_to_page(to), page, pass > 2); - unlock_page(page); - -next: - if (rc) { - if (newpage) - newpage->mapping = NULL; - - if (rc == -EAGAIN) + switch(rc) { + case -EAGAIN: retry++; - else { + break; + case 0: + list_move(&page->lru, moved); + break; + default: /* Permanent failure */ list_move(&page->lru, failed); nr_failed++; + break; } - } else { - if (newpage) { - /* Successful migration. Return page to LRU */ - move_to_lru(newpage); - } - list_move(&page->lru, moved); } } - if (retry && pass++ < 10) - goto redo; if (!swapwrite) current->flags &= ~PF_SWAPWRITE; -- cgit v1.2.3-70-g09d2 From aaa994b300a172afafab47938804836b923e5ef7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:52 -0700 Subject: [PATCH] page migration: handle freeing of pages in migrate_pages() Do not leave pages on the lists passed to migrate_pages(). Seems that we will not need any postprocessing of pages. This will simplify the handling of pages by the callers of migrate_pages(). Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 7 +++---- mm/mempolicy.c | 8 +------- mm/migrate.c | 48 +++++++++++++++++++++++------------------------- 3 files changed, 27 insertions(+), 36 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 287c47b5e5d..83af25949fa 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -8,8 +8,7 @@ extern int isolate_lru_page(struct page *p, struct list_head *pagelist); extern int putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); -extern int migrate_pages(struct list_head *l, struct list_head *t, - struct list_head *moved, struct list_head *failed); +extern int migrate_pages(struct list_head *l, struct list_head *t); extern int migrate_pages_to(struct list_head *pagelist, struct vm_area_struct *vma, int dest); extern int fail_migrate_page(struct address_space *, @@ -22,8 +21,8 @@ extern int migrate_prep(void); static inline int isolate_lru_page(struct page *p, struct list_head *list) { return -ENOSYS; } static inline int putback_lru_pages(struct list_head *l) { return 0; } -static inline int migrate_pages(struct list_head *l, struct list_head *t, - struct list_head *moved, struct list_head *failed) { return -ENOSYS; } +static inline int migrate_pages(struct list_head *l, struct list_head *t) + { return -ENOSYS; } static inline int migrate_pages_to(struct list_head *pagelist, struct vm_area_struct *vma, int dest) { return 0; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8778f58880c..244f3f130e4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -603,11 +603,8 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); - if (!list_empty(&pagelist)) { + if (!list_empty(&pagelist)) err = migrate_pages_to(&pagelist, NULL, dest); - if (!list_empty(&pagelist)) - putback_lru_pages(&pagelist); - } return err; } @@ -773,9 +770,6 @@ long do_mbind(unsigned long start, unsigned long len, err = -EIO; } - if (!list_empty(&pagelist)) - putback_lru_pages(&pagelist); - up_write(&mm->mmap_sem); mpol_free(new); return err; diff --git a/mm/migrate.c b/mm/migrate.c index 09038163bfe..d3a1810a4c9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -624,6 +624,15 @@ unlock: unlock_page(page); ret: if (rc != -EAGAIN) { + /* + * A page that has been migrated has all references + * removed and will be freed. A page that has not been + * migrated will have kepts its references and be + * restored. + */ + list_del(&page->lru); + move_to_lru(page); + list_del(&newpage->lru); move_to_lru(newpage); } @@ -640,12 +649,12 @@ ret: * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty - * or no retryable pages exist anymore. + * or no retryable pages exist anymore. All pages will be + * retruned to the LRU or freed. * - * Return: Number of pages not migrated when "to" ran empty. + * Return: Number of pages not migrated. */ -int migrate_pages(struct list_head *from, struct list_head *to, - struct list_head *moved, struct list_head *failed) +int migrate_pages(struct list_head *from, struct list_head *to) { int retry = 1; int nr_failed = 0; @@ -675,11 +684,9 @@ int migrate_pages(struct list_head *from, struct list_head *to, retry++; break; case 0: - list_move(&page->lru, moved); break; default: /* Permanent failure */ - list_move(&page->lru, failed); nr_failed++; break; } @@ -689,6 +696,7 @@ int migrate_pages(struct list_head *from, struct list_head *to, if (!swapwrite) current->flags &= ~PF_SWAPWRITE; + putback_lru_pages(from); return nr_failed + retry; } @@ -702,11 +710,10 @@ int migrate_pages_to(struct list_head *pagelist, struct vm_area_struct *vma, int dest) { LIST_HEAD(newlist); - LIST_HEAD(moved); - LIST_HEAD(failed); int err = 0; unsigned long offset = 0; int nr_pages; + int nr_failed = 0; struct page *page; struct list_head *p; @@ -740,26 +747,17 @@ redo: if (nr_pages > MIGRATE_CHUNK_SIZE) break; } - err = migrate_pages(pagelist, &newlist, &moved, &failed); + err = migrate_pages(pagelist, &newlist); - putback_lru_pages(&moved); /* Call release pages instead ?? */ - - if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) - goto redo; -out: - /* Return leftover allocated pages */ - while (!list_empty(&newlist)) { - page = list_entry(newlist.next, struct page, lru); - list_del(&page->lru); - __free_page(page); + if (err >= 0) { + nr_failed += err; + if (list_empty(&newlist) && !list_empty(pagelist)) + goto redo; } - list_splice(&failed, pagelist); - if (err < 0) - return err; +out: /* Calculate number of leftover pages */ - nr_pages = 0; list_for_each(p, pagelist) - nr_pages++; - return nr_pages; + nr_failed++; + return nr_failed; } -- cgit v1.2.3-70-g09d2 From 95a402c3847cc16f4ba03013cd01404fa0f14c2e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:53 -0700 Subject: [PATCH] page migration: use allocator function for migrate_pages() Instead of passing a list of new pages, pass a function to allocate a new page. This allows the correct placement of MPOL_INTERLEAVE pages during page migration. It also further simplifies the callers of migrate pages. migrate_pages() becomes similar to migrate_pages_to() so drop migrate_pages_to(). The batching of new page allocations becomes unnecessary. Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 11 ++--- mm/mempolicy.c | 23 +++++++++- mm/migrate.c | 115 ++++++++++++++---------------------------------- 3 files changed, 59 insertions(+), 90 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 83af25949fa..5b95d6568dc 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -3,14 +3,15 @@ #include +typedef struct page *new_page_t(struct page *, unsigned long private); + #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p, struct list_head *pagelist); extern int putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); -extern int migrate_pages(struct list_head *l, struct list_head *t); -extern int migrate_pages_to(struct list_head *pagelist, - struct vm_area_struct *vma, int dest); +extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long); + extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -21,8 +22,8 @@ extern int migrate_prep(void); static inline int isolate_lru_page(struct page *p, struct list_head *list) { return -ENOSYS; } static inline int putback_lru_pages(struct list_head *l) { return 0; } -static inline int migrate_pages(struct list_head *l, struct list_head *t) - { return -ENOSYS; } +static inline int migrate_pages(struct list_head *l, new_page_t x, + unsigned long private) { return -ENOSYS; } static inline int migrate_pages_to(struct list_head *pagelist, struct vm_area_struct *vma, int dest) { return 0; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 244f3f130e4..f432642e9e6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include @@ -587,6 +588,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, isolate_lru_page(page, pagelist); } +static struct page *new_node_page(struct page *page, unsigned long node) +{ + return alloc_pages_node(node, GFP_HIGHUSER, 0); +} + /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. @@ -604,7 +610,8 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) - err = migrate_pages_to(&pagelist, NULL, dest); + err = migrate_pages(&pagelist, new_node_page, dest); + return err; } @@ -691,6 +698,12 @@ int do_migrate_pages(struct mm_struct *mm, } +static struct page *new_vma_page(struct page *page, unsigned long private) +{ + struct vm_area_struct *vma = (struct vm_area_struct *)private; + + return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma)); +} #else static void migrate_page_add(struct page *page, struct list_head *pagelist, @@ -703,6 +716,11 @@ int do_migrate_pages(struct mm_struct *mm, { return -ENOSYS; } + +static struct page *new_vma_page(struct page *page, unsigned long private) +{ + return NULL; +} #endif long do_mbind(unsigned long start, unsigned long len, @@ -764,7 +782,8 @@ long do_mbind(unsigned long start, unsigned long len, err = mbind_range(vma, start, end, new); if (!list_empty(&pagelist)) - nr_failed = migrate_pages_to(&pagelist, vma, -1); + nr_failed = migrate_pages(&pagelist, new_vma_page, + (unsigned long)vma); if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; diff --git a/mm/migrate.c b/mm/migrate.c index d3a1810a4c9..251a8d15825 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -28,9 +28,6 @@ #include "internal.h" -/* The maximum number of pages to take off the LRU for migration */ -#define MIGRATE_CHUNK_SIZE 256 - #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) /* @@ -587,18 +584,23 @@ static int move_to_new_page(struct page *newpage, struct page *page) * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static int unmap_and_move(struct page *newpage, struct page *page, int force) +static int unmap_and_move(new_page_t get_new_page, unsigned long private, + struct page *page, int force) { int rc = 0; + struct page *newpage = get_new_page(page, private); + + if (!newpage) + return -ENOMEM; if (page_count(page) == 1) /* page was freed from under us. So we are done. */ - goto ret; + goto move_newpage; rc = -EAGAIN; if (TestSetPageLocked(page)) { if (!force) - goto ret; + goto move_newpage; lock_page(page); } @@ -622,7 +624,7 @@ static int unmap_and_move(struct page *newpage, struct page *page, int force) remove_migration_ptes(page, page); unlock: unlock_page(page); -ret: + if (rc != -EAGAIN) { /* * A page that has been migrated has all references @@ -632,29 +634,33 @@ ret: */ list_del(&page->lru); move_to_lru(page); - - list_del(&newpage->lru); - move_to_lru(newpage); } + +move_newpage: + /* + * Move the new page to the LRU. If migration was not successful + * then this will free the page. + */ + move_to_lru(newpage); return rc; } /* * migrate_pages * - * Two lists are passed to this function. The first list - * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the isolated pages - * can be moved to. + * The function takes one list of pages to migrate and a function + * that determines from the page to be migrated and the private data + * the target of the move and allocates the page. * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty * or no retryable pages exist anymore. All pages will be * retruned to the LRU or freed. * - * Return: Number of pages not migrated. + * Return: Number of pages not migrated or error code. */ -int migrate_pages(struct list_head *from, struct list_head *to) +int migrate_pages(struct list_head *from, + new_page_t get_new_page, unsigned long private) { int retry = 1; int nr_failed = 0; @@ -671,15 +677,14 @@ int migrate_pages(struct list_head *from, struct list_head *to) retry = 0; list_for_each_entry_safe(page, page2, from, lru) { - - if (list_empty(to)) - break; - cond_resched(); - rc = unmap_and_move(lru_to_page(to), page, pass > 2); + rc = unmap_and_move(get_new_page, private, + page, pass > 2); switch(rc) { + case -ENOMEM: + goto out; case -EAGAIN: retry++; break; @@ -692,72 +697,16 @@ int migrate_pages(struct list_head *from, struct list_head *to) } } } - + rc = 0; +out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; putback_lru_pages(from); - return nr_failed + retry; -} -/* - * Migrate the list 'pagelist' of pages to a certain destination. - * - * Specify destination with either non-NULL vma or dest_node >= 0 - * Return the number of pages not migrated or error code - */ -int migrate_pages_to(struct list_head *pagelist, - struct vm_area_struct *vma, int dest) -{ - LIST_HEAD(newlist); - int err = 0; - unsigned long offset = 0; - int nr_pages; - int nr_failed = 0; - struct page *page; - struct list_head *p; - -redo: - nr_pages = 0; - list_for_each(p, pagelist) { - if (vma) { - /* - * The address passed to alloc_page_vma is used to - * generate the proper interleave behavior. We fake - * the address here by an increasing offset in order - * to get the proper distribution of pages. - * - * No decision has been made as to which page - * a certain old page is moved to so we cannot - * specify the correct address. - */ - page = alloc_page_vma(GFP_HIGHUSER, vma, - offset + vma->vm_start); - offset += PAGE_SIZE; - } - else - page = alloc_pages_node(dest, GFP_HIGHUSER, 0); - - if (!page) { - err = -ENOMEM; - goto out; - } - list_add_tail(&page->lru, &newlist); - nr_pages++; - if (nr_pages > MIGRATE_CHUNK_SIZE) - break; - } - err = migrate_pages(pagelist, &newlist); - - if (err >= 0) { - nr_failed += err; - if (list_empty(&newlist) && !list_empty(pagelist)) - goto redo; - } -out: + if (rc) + return rc; - /* Calculate number of leftover pages */ - list_for_each(p, pagelist) - nr_failed++; - return nr_failed; + return nr_failed + retry; } + -- cgit v1.2.3-70-g09d2 From 742755a1d8ce2b548428f7aacf1758b4bba50080 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:55 -0700 Subject: [PATCH] page migration: sys_move_pages(): support moving of individual pages move_pages() is used to move individual pages of a process. The function can be used to determine the location of pages and to move them onto the desired node. move_pages() returns status information for each page. long move_pages(pid, number_of_pages_to_move, addresses_of_pages[], nodes[] or NULL, status[], flags); The addresses of pages is an array of void * pointing to the pages to be moved. The nodes array contains the node numbers that the pages should be moved to. If a NULL is passed instead of an array then no pages are moved but the status array is updated. The status request may be used to determine the page state before issuing another move_pages() to move pages. The status array will contain the state of all individual page migration attempts when the function terminates. The status array is only valid if move_pages() completed successfullly. Possible page states in status[]: 0..MAX_NUMNODES The page is now on the indicated node. -ENOENT Page is not present -EACCES Page is mapped by multiple processes and can only be moved if MPOL_MF_MOVE_ALL is specified. -EPERM The page has been mlocked by a process/driver and cannot be moved. -EBUSY Page is busy and cannot be moved. Try again later. -EFAULT Invalid address (no VMA or zero page). -ENOMEM Unable to allocate memory on target node. -EIO Unable to write back page. The page must be written back in order to move it since the page is dirty and the filesystem does not provide a migration function that would allow the moving of dirty pages. -EINVAL A dirty page cannot be moved. The filesystem does not provide a migration function and has no ability to write back pages. The flags parameter indicates what types of pages to move: MPOL_MF_MOVE Move pages that are only mapped by the process. MPOL_MF_MOVE_ALL Also move pages that are mapped by multiple processes. Requires sufficient capabilities. Possible return codes from move_pages() -ENOENT No pages found that would require moving. All pages are either already on the target node, not present, had an invalid address or could not be moved because they were mapped by multiple processes. -EINVAL Flags other than MPOL_MF_MOVE(_ALL) specified or an attempt to migrate pages in a kernel thread. -EPERM MPOL_MF_MOVE_ALL specified without sufficient priviledges. or an attempt to move a process belonging to another user. -EACCES One of the target nodes is not allowed by the current cpuset. -ENODEV One of the target nodes is not online. -ESRCH Process does not exist. -E2BIG Too many pages to move. -ENOMEM Not enough memory to allocate control array. -EFAULT Parameters could not be accessed. A test program for move_pages() may be found with the patches on ftp.kernel.org:/pub/linux/kernel/people/christoph/pmig/patches-2.6.17-rc4-mm3 From: Christoph Lameter Detailed results for sys_move_pages() Pass a pointer to an integer to get_new_page() that may be used to indicate where the completion status of a migration operation should be placed. This allows sys_move_pags() to report back exactly what happened to each page. Wish there would be a better way to do this. Looks a bit hacky. Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_migration | 29 ++--- arch/ia64/kernel/entry.S | 2 +- include/asm-ia64/unistd.h | 2 +- include/linux/migrate.h | 2 +- include/linux/syscalls.h | 5 + kernel/sys_ni.c | 1 + mm/mempolicy.c | 4 +- mm/migrate.c | 268 +++++++++++++++++++++++++++++++++++++++- 8 files changed, 288 insertions(+), 25 deletions(-) (limited to 'mm/migrate.c') diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index 0a5d5fb1885..99f89aa1016 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -26,8 +26,13 @@ a process are located. See also the numa_maps manpage in the numactl package. Manual migration is useful if for example the scheduler has relocated a process to a processor on a distant node. A batch scheduler or an administrator may detect the situation and move the pages of the process -nearer to the new processor. At some point in the future we may have -some mechanism in the scheduler that will automatically move the pages. +nearer to the new processor. The kernel itself does only provide +manual page migration support. Automatic page migration may be implemented +through user space processes that move pages. A special function call +"move_pages" allows the moving of individual pages within a process. +A NUMA profiler may f.e. obtain a log showing frequent off node +accesses and may use the result to move pages to more advantageous +locations. Larger installations usually partition the system using cpusets into sections of nodes. Paul Jackson has equipped cpusets with the ability to @@ -62,22 +67,14 @@ A. In kernel use of migrate_pages() It also prevents the swapper or other scans to encounter the page. -2. Generate a list of newly allocates pages. These pages will contain the - contents of the pages from the first list after page migration is - complete. +2. We need to have a function of type new_page_t that can be + passed to migrate_pages(). This function should figure out + how to allocate the correct new page given the old page. 3. The migrate_pages() function is called which attempts - to do the migration. It returns the moved pages in the - list specified as the third parameter and the failed - migrations in the fourth parameter. When the function - returns the first list will contain the pages that could still be retried. - -4. The leftover pages of various types are returned - to the LRU using putback_to_lru_pages() or otherwise - disposed of. The pages will still have the refcount as - increased by isolate_lru_pages() if putback_to_lru_pages() is not - used! The kernel may want to handle the various cases of failures in - different ways. + to do the migration. It will call the function to allocate + the new page for each page that is considered for + moving. B. How migrate_pages() works ---------------------------- diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index bcb80ca5cf4..32c999f58d1 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1584,7 +1584,7 @@ sys_call_table: data8 sys_keyctl data8 sys_ioprio_set data8 sys_ioprio_get // 1275 - data8 sys_ni_syscall + data8 sys_move_pages data8 sys_inotify_init data8 sys_inotify_add_watch data8 sys_inotify_rm_watch diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h index 632f2eedf72..bb0eb727dcd 100644 --- a/include/asm-ia64/unistd.h +++ b/include/asm-ia64/unistd.h @@ -265,7 +265,7 @@ #define __NR_keyctl 1273 #define __NR_ioprio_set 1274 #define __NR_ioprio_get 1275 -/* 1276 is available for reuse (was briefly sys_set_zone_reclaim) */ +#define __NR_move_pages 1276 #define __NR_inotify_init 1277 #define __NR_inotify_add_watch 1278 #define __NR_inotify_rm_watch 1279 diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 5b95d6568dc..5dba23a1c0d 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -3,7 +3,7 @@ #include -typedef struct page *new_page_t(struct page *, unsigned long private); +typedef struct page *new_page_t(struct page *, unsigned long private, int **); #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p, struct list_head *pagelist); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index bd67a4413df..7e3f2349091 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -516,6 +516,11 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *from, const unsigned long __user *to); +asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, + int flags); asmlinkage long sys_mbind(unsigned long start, unsigned long len, unsigned long mode, unsigned long __user *nmask, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5433195040f..597229749de 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); +cond_syscall(sys_move_pages); cond_syscall(sys_chown16); cond_syscall(sys_fchown16); cond_syscall(sys_getegid16); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f432642e9e6..05b84acf0bb 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -588,7 +588,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, isolate_lru_page(page, pagelist); } -static struct page *new_node_page(struct page *page, unsigned long node) +static struct page *new_node_page(struct page *page, unsigned long node, int **x) { return alloc_pages_node(node, GFP_HIGHUSER, 0); } @@ -698,7 +698,7 @@ int do_migrate_pages(struct mm_struct *mm, } -static struct page *new_vma_page(struct page *page, unsigned long private) +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) { struct vm_area_struct *vma = (struct vm_area_struct *)private; diff --git a/mm/migrate.c b/mm/migrate.c index 251a8d15825..033a12f4c94 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -62,9 +64,8 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) } /* - * migrate_prep() needs to be called after we have compiled the list of pages - * to be migrated using isolate_lru_page() but before we begin a series of calls - * to migrate_pages(). + * migrate_prep() needs to be called before we start compiling a list of pages + * to be migrated using isolate_lru_page(). */ int migrate_prep(void) { @@ -588,7 +589,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, struct page *page, int force) { int rc = 0; - struct page *newpage = get_new_page(page, private); + int *result = NULL; + struct page *newpage = get_new_page(page, private, &result); if (!newpage) return -ENOMEM; @@ -642,6 +644,12 @@ move_newpage: * then this will free the page. */ move_to_lru(newpage); + if (result) { + if (rc) + *result = rc; + else + *result = page_to_nid(newpage); + } return rc; } @@ -710,3 +718,255 @@ out: return nr_failed + retry; } +#ifdef CONFIG_NUMA +/* + * Move a list of individual pages + */ +struct page_to_node { + unsigned long addr; + struct page *page; + int node; + int status; +}; + +static struct page *new_page_node(struct page *p, unsigned long private, + int **result) +{ + struct page_to_node *pm = (struct page_to_node *)private; + + while (pm->node != MAX_NUMNODES && pm->page != p) + pm++; + + if (pm->node == MAX_NUMNODES) + return NULL; + + *result = &pm->status; + + return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); +} + +/* + * Move a set of pages as indicated in the pm array. The addr + * field must be set to the virtual address of the page to be moved + * and the node number must contain a valid target node. + */ +static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, + int migrate_all) +{ + int err; + struct page_to_node *pp; + LIST_HEAD(pagelist); + + down_read(&mm->mmap_sem); + + /* + * Build a list of pages to migrate + */ + migrate_prep(); + for (pp = pm; pp->node != MAX_NUMNODES; pp++) { + struct vm_area_struct *vma; + struct page *page; + + /* + * A valid page pointer that will not match any of the + * pages that will be moved. + */ + pp->page = ZERO_PAGE(0); + + err = -EFAULT; + vma = find_vma(mm, pp->addr); + if (!vma) + goto set_status; + + page = follow_page(vma, pp->addr, FOLL_GET); + err = -ENOENT; + if (!page) + goto set_status; + + if (PageReserved(page)) /* Check for zero page */ + goto put_and_set; + + pp->page = page; + err = page_to_nid(page); + + if (err == pp->node) + /* + * Node already in the right place + */ + goto put_and_set; + + err = -EACCES; + if (page_mapcount(page) > 1 && + !migrate_all) + goto put_and_set; + + err = isolate_lru_page(page, &pagelist); +put_and_set: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. + */ + put_page(page); +set_status: + pp->status = err; + } + + if (!list_empty(&pagelist)) + err = migrate_pages(&pagelist, new_page_node, + (unsigned long)pm); + else + err = -ENOENT; + + up_read(&mm->mmap_sem); + return err; +} + +/* + * Determine the nodes of a list of pages. The addr in the pm array + * must have been set to the virtual address of which we want to determine + * the node number. + */ +static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) +{ + down_read(&mm->mmap_sem); + + for ( ; pm->node != MAX_NUMNODES; pm++) { + struct vm_area_struct *vma; + struct page *page; + int err; + + err = -EFAULT; + vma = find_vma(mm, pm->addr); + if (!vma) + goto set_status; + + page = follow_page(vma, pm->addr, 0); + err = -ENOENT; + /* Use PageReserved to check for zero page */ + if (!page || PageReserved(page)) + goto set_status; + + err = page_to_nid(page); +set_status: + pm->status = err; + } + + up_read(&mm->mmap_sem); + return 0; +} + +/* + * Move a list of pages in the address space of the currently executing + * process. + */ +asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) +{ + int err = 0; + int i; + struct task_struct *task; + nodemask_t task_nodes; + struct mm_struct *mm; + struct page_to_node *pm = NULL; + + /* Check flags */ + if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + return -EINVAL; + + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_pid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser privileges or the same + * userid as the target process. + */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out2; + } + + task_nodes = cpuset_mems_allowed(task); + + /* Limit nr_pages so that the multiplication may not overflow */ + if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { + err = -E2BIG; + goto out2; + } + + pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); + if (!pm) { + err = -ENOMEM; + goto out2; + } + + /* + * Get parameters from user space and initialize the pm + * array. Return various errors if the user did something wrong. + */ + for (i = 0; i < nr_pages; i++) { + const void *p; + + err = -EFAULT; + if (get_user(p, pages + i)) + goto out; + + pm[i].addr = (unsigned long)p; + if (nodes) { + int node; + + if (get_user(node, nodes + i)) + goto out; + + err = -ENODEV; + if (!node_online(node)) + goto out; + + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out; + + pm[i].node = node; + } + } + /* End marker */ + pm[nr_pages].node = MAX_NUMNODES; + + if (nodes) + err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); + else + err = do_pages_stat(mm, pm); + + if (err >= 0) + /* Return status information */ + for (i = 0; i < nr_pages; i++) + if (put_user(pm[i].status, status + i)) + err = -EFAULT; + +out: + vfree(pm); +out2: + mmput(mm); + return err; +} +#endif + -- cgit v1.2.3-70-g09d2 From 86c3a7645c05a7d06b72653aa4b2bea4e7229d1b Mon Sep 17 00:00:00 2001 From: David Quigley Date: Fri, 23 Jun 2006 02:04:02 -0700 Subject: [PATCH] SELinux: add security_task_movememory calls to mm code This patch inserts security_task_movememory hook calls into memory management code to enable security modules to mediate this operation between tasks. Since the last posting, the hook has been renamed following feedback from Christoph Lameter. Signed-off-by: David Quigley Acked-by: Stephen Smalley Signed-off-by: James Morris Cc: Andi Kleen Acked-by: Christoph Lameter Acked-by: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 +++++ mm/migrate.c | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'mm/migrate.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 05b84acf0bb..ec4a1a950df 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -88,6 +88,7 @@ #include #include #include +#include #include #include @@ -942,6 +943,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, goto out; } + err = security_task_movememory(task); + if (err) + goto out; + err = do_migrate_pages(mm, &old, &new, capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); out: diff --git a/mm/migrate.c b/mm/migrate.c index 033a12f4c94..1c2a71aa05c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "internal.h" @@ -905,6 +906,11 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, goto out2; } + err = security_task_movememory(task); + if (err) + goto out2; + + task_nodes = cpuset_mems_allowed(task); /* Limit nr_pages so that the multiplication may not overflow */ -- cgit v1.2.3-70-g09d2