summaryrefslogtreecommitdiffstats
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c562
1 files changed, 355 insertions, 207 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c82290b9c1f..eeceeeb0901 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -35,7 +35,6 @@
#include <linux/node.h>
#include "internal.h"
-const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
unsigned long hugepages_treat_as_movable;
int hugetlb_max_hstate __read_mostly;
@@ -544,7 +543,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
- if (hugepages_treat_as_movable || hugepage_migration_support(h))
+ if (hugepages_treat_as_movable || hugepage_migration_supported(h))
return GFP_HIGHUSER_MOVABLE;
else
return GFP_HIGHUSER;
@@ -607,25 +606,242 @@ err:
return NULL;
}
+/*
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed. Ensure that we use an allowed
+ * node for alloc or free.
+ */
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ nid = next_node(nid, *nodes_allowed);
+ if (nid == MAX_NUMNODES)
+ nid = first_node(*nodes_allowed);
+ VM_BUG_ON(nid >= MAX_NUMNODES);
+
+ return nid;
+}
+
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ if (!node_isset(nid, *nodes_allowed))
+ nid = next_node_allowed(nid, nodes_allowed);
+ return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static int hstate_next_node_to_alloc(struct hstate *h,
+ nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+ h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+/*
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page. Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
+ */
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+ h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
+ nr_nodes--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_free(hs, mask)) || 1); \
+ nr_nodes--)
+
+#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+static void destroy_compound_gigantic_page(struct page *page,
+ unsigned long order)
+{
+ int i;
+ int nr_pages = 1 << order;
+ struct page *p = page + 1;
+
+ for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ __ClearPageTail(p);
+ set_page_refcounted(p);
+ p->first_page = NULL;
+ }
+
+ set_compound_order(page, 0);
+ __ClearPageHead(page);
+}
+
+static void free_gigantic_page(struct page *page, unsigned order)
+{
+ free_contig_range(page_to_pfn(page), 1 << order);
+}
+
+static int __alloc_gigantic_page(unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long end_pfn = start_pfn + nr_pages;
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+}
+
+static bool pfn_range_valid_gigantic(unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long i, end_pfn = start_pfn + nr_pages;
+ struct page *page;
+
+ for (i = start_pfn; i < end_pfn; i++) {
+ if (!pfn_valid(i))
+ return false;
+
+ page = pfn_to_page(i);
+
+ if (PageReserved(page))
+ return false;
+
+ if (page_count(page) > 0)
+ return false;
+
+ if (PageHuge(page))
+ return false;
+ }
+
+ return true;
+}
+
+static bool zone_spans_last_pfn(const struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ unsigned long last_pfn = start_pfn + nr_pages - 1;
+ return zone_spans_pfn(zone, last_pfn);
+}
+
+static struct page *alloc_gigantic_page(int nid, unsigned order)
+{
+ unsigned long nr_pages = 1 << order;
+ unsigned long ret, pfn, flags;
+ struct zone *z;
+
+ z = NODE_DATA(nid)->node_zones;
+ for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
+ spin_lock_irqsave(&z->lock, flags);
+
+ pfn = ALIGN(z->zone_start_pfn, nr_pages);
+ while (zone_spans_last_pfn(z, pfn, nr_pages)) {
+ if (pfn_range_valid_gigantic(pfn, nr_pages)) {
+ /*
+ * We release the zone lock here because
+ * alloc_contig_range() will also lock the zone
+ * at some point. If there's an allocation
+ * spinning on this lock, it may win the race
+ * and cause alloc_contig_range() to fail...
+ */
+ spin_unlock_irqrestore(&z->lock, flags);
+ ret = __alloc_gigantic_page(pfn, nr_pages);
+ if (!ret)
+ return pfn_to_page(pfn);
+ spin_lock_irqsave(&z->lock, flags);
+ }
+ pfn += nr_pages;
+ }
+
+ spin_unlock_irqrestore(&z->lock, flags);
+ }
+
+ return NULL;
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
+static void prep_compound_gigantic_page(struct page *page, unsigned long order);
+
+static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ page = alloc_gigantic_page(nid, huge_page_order(h));
+ if (page) {
+ prep_compound_gigantic_page(page, huge_page_order(h));
+ prep_new_huge_page(h, page, nid);
+ }
+
+ return page;
+}
+
+static int alloc_fresh_gigantic_page(struct hstate *h,
+ nodemask_t *nodes_allowed)
+{
+ struct page *page = NULL;
+ int nr_nodes, node;
+
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ page = alloc_fresh_gigantic_page_node(h, node);
+ if (page)
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline bool gigantic_page_supported(void) { return true; }
+#else
+static inline bool gigantic_page_supported(void) { return false; }
+static inline void free_gigantic_page(struct page *page, unsigned order) { }
+static inline void destroy_compound_gigantic_page(struct page *page,
+ unsigned long order) { }
+static inline int alloc_fresh_gigantic_page(struct hstate *h,
+ nodemask_t *nodes_allowed) { return 0; }
+#endif
+
static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
- VM_BUG_ON(h->order >= MAX_ORDER);
+ if (hstate_is_gigantic(h) && !gigantic_page_supported())
+ return;
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h); i++) {
page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
- 1 << PG_active | 1 << PG_reserved |
- 1 << PG_private | 1 << PG_writeback);
+ 1 << PG_active | 1 << PG_private |
+ 1 << PG_writeback);
}
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
- arch_release_hugepage(page);
- __free_pages(page, huge_page_order(h));
+ if (hstate_is_gigantic(h)) {
+ destroy_compound_gigantic_page(page, huge_page_order(h));
+ free_gigantic_page(page, huge_page_order(h));
+ } else {
+ arch_release_hugepage(page);
+ __free_pages(page, huge_page_order(h));
+ }
}
struct hstate *size_to_hstate(unsigned long size)
@@ -639,7 +855,7 @@ struct hstate *size_to_hstate(unsigned long size)
return NULL;
}
-static void free_huge_page(struct page *page)
+void free_huge_page(struct page *page)
{
/*
* Can't pass hstate in here because it is called from the
@@ -664,7 +880,7 @@ static void free_huge_page(struct page *page)
if (restore_reserve)
h->resv_huge_pages++;
- if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+ if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
list_del(&page->lru);
update_and_free_page(h, page);
@@ -690,8 +906,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
put_page(page); /* free it into the hugepage allocator */
}
-static void __init prep_compound_gigantic_page(struct page *page,
- unsigned long order)
+static void prep_compound_gigantic_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
@@ -769,9 +984,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
- if (h->order >= MAX_ORDER)
- return NULL;
-
page = alloc_pages_exact_node(nid,
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN,
@@ -787,79 +999,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
return page;
}
-/*
- * common helper functions for hstate_next_node_to_{alloc|free}.
- * We may have allocated or freed a huge page based on a different
- * nodes_allowed previously, so h->next_node_to_{alloc|free} might
- * be outside of *nodes_allowed. Ensure that we use an allowed
- * node for alloc or free.
- */
-static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
- nid = next_node(nid, *nodes_allowed);
- if (nid == MAX_NUMNODES)
- nid = first_node(*nodes_allowed);
- VM_BUG_ON(nid >= MAX_NUMNODES);
-
- return nid;
-}
-
-static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
- if (!node_isset(nid, *nodes_allowed))
- nid = next_node_allowed(nid, nodes_allowed);
- return nid;
-}
-
-/*
- * returns the previously saved node ["this node"] from which to
- * allocate a persistent huge page for the pool and advance the
- * next node from which to allocate, handling wrap at end of node
- * mask.
- */
-static int hstate_next_node_to_alloc(struct hstate *h,
- nodemask_t *nodes_allowed)
-{
- int nid;
-
- VM_BUG_ON(!nodes_allowed);
-
- nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
- h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
-
- return nid;
-}
-
-/*
- * helper for free_pool_huge_page() - return the previously saved
- * node ["this node"] from which to free a huge page. Advance the
- * next node id whether or not we find a free huge page to free so
- * that the next attempt to free addresses the next node.
- */
-static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
-{
- int nid;
-
- VM_BUG_ON(!nodes_allowed);
-
- nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
- h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
-
- return nid;
-}
-
-#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
- for (nr_nodes = nodes_weight(*mask); \
- nr_nodes > 0 && \
- ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
- nr_nodes--)
-
-#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
- for (nr_nodes = nodes_weight(*mask); \
- nr_nodes > 0 && \
- ((node = hstate_next_node_to_free(hs, mask)) || 1); \
- nr_nodes--)
-
static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
struct page *page;
@@ -949,6 +1088,9 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
unsigned long pfn;
struct hstate *h;
+ if (!hugepages_supported())
+ return;
+
/* Set scan step to minimum hugepage size */
for_each_hstate(h)
if (order > huge_page_order(h))
@@ -963,7 +1105,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
struct page *page;
unsigned int r_nid;
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h))
return NULL;
/*
@@ -1156,7 +1298,7 @@ static void return_unused_surplus_pages(struct hstate *h,
h->resv_huge_pages -= unused_resv_pages;
/* Cannot return gigantic pages currently */
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h))
return;
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
@@ -1246,24 +1388,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
return ERR_PTR(-ENOSPC);
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
- if (ret) {
- if (chg || avoid_reserve)
- hugepage_subpool_put_pages(spool, 1);
- return ERR_PTR(-ENOSPC);
- }
+ if (ret)
+ goto out_subpool_put;
+
spin_lock(&hugetlb_lock);
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
if (!page) {
spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
- if (!page) {
- hugetlb_cgroup_uncharge_cgroup(idx,
- pages_per_huge_page(h),
- h_cg);
- if (chg || avoid_reserve)
- hugepage_subpool_put_pages(spool, 1);
- return ERR_PTR(-ENOSPC);
- }
+ if (!page)
+ goto out_uncharge_cgroup;
+
spin_lock(&hugetlb_lock);
list_move(&page->lru, &h->hugepage_activelist);
/* Fall through */
@@ -1275,6 +1410,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
vma_commit_reservation(h, vma, addr);
return page;
+
+out_uncharge_cgroup:
+ hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
+out_subpool_put:
+ if (chg || avoid_reserve)
+ hugepage_subpool_put_pages(spool, 1);
+ return ERR_PTR(-ENOSPC);
}
/*
@@ -1356,7 +1498,7 @@ static void __init gather_bootmem_prealloc(void)
* fix confusing memory reports from free(1) and another
* side-effects, like CommitLimit going negative.
*/
- if (h->order > (MAX_ORDER - 1))
+ if (hstate_is_gigantic(h))
adjust_managed_page_count(page, 1 << h->order);
}
}
@@ -1366,7 +1508,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
unsigned long i;
for (i = 0; i < h->max_huge_pages; ++i) {
- if (h->order >= MAX_ORDER) {
+ if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_fresh_huge_page(h,
@@ -1382,7 +1524,7 @@ static void __init hugetlb_init_hstates(void)
for_each_hstate(h) {
/* oversize hugepages were init'ed in early boot */
- if (h->order < MAX_ORDER)
+ if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
}
}
@@ -1416,7 +1558,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
{
int i;
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h))
return;
for_each_node_mask(i, *nodes_allowed) {
@@ -1479,7 +1621,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
{
unsigned long min_count, ret;
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h) && !gigantic_page_supported())
return h->max_huge_pages;
/*
@@ -1506,7 +1648,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* and reducing the surplus.
*/
spin_unlock(&hugetlb_lock);
- ret = alloc_fresh_huge_page(h, nodes_allowed);
+ if (hstate_is_gigantic(h))
+ ret = alloc_fresh_gigantic_page(h, nodes_allowed);
+ else
+ ret = alloc_fresh_huge_page(h, nodes_allowed);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
@@ -1591,22 +1736,14 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
return sprintf(buf, "%lu\n", nr_huge_pages);
}
-static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
- struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t len)
+static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
+ struct hstate *h, int nid,
+ unsigned long count, size_t len)
{
int err;
- int nid;
- unsigned long count;
- struct hstate *h;
NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
- err = kstrtoul(buf, 10, &count);
- if (err)
- goto out;
-
- h = kobj_to_hstate(kobj, &nid);
- if (h->order >= MAX_ORDER) {
+ if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
err = -EINVAL;
goto out;
}
@@ -1641,6 +1778,23 @@ out:
return err;
}
+static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
+ struct kobject *kobj, const char *buf,
+ size_t len)
+{
+ struct hstate *h;
+ unsigned long count;
+ int nid;
+ int err;
+
+ err = kstrtoul(buf, 10, &count);
+ if (err)
+ return err;
+
+ h = kobj_to_hstate(kobj, &nid);
+ return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
+}
+
static ssize_t nr_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -1650,7 +1804,7 @@ static ssize_t nr_hugepages_show(struct kobject *kobj,
static ssize_t nr_hugepages_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t len)
{
- return nr_hugepages_store_common(false, kobj, attr, buf, len);
+ return nr_hugepages_store_common(false, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages);
@@ -1669,7 +1823,7 @@ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t len)
{
- return nr_hugepages_store_common(true, kobj, attr, buf, len);
+ return nr_hugepages_store_common(true, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages_mempolicy);
#endif
@@ -1689,7 +1843,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h))
return -EINVAL;
err = kstrtoul(buf, 10, &input);
@@ -2105,36 +2259,21 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
- unsigned long tmp;
+ unsigned long tmp = h->max_huge_pages;
int ret;
if (!hugepages_supported())
return -ENOTSUPP;
- tmp = h->max_huge_pages;
-
- if (write && h->order >= MAX_ORDER)
- return -EINVAL;
-
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
if (ret)
goto out;
- if (write) {
- NODEMASK_ALLOC(nodemask_t, nodes_allowed,
- GFP_KERNEL | __GFP_NORETRY);
- if (!(obey_mempolicy &&
- init_nodemask_of_mempolicy(nodes_allowed))) {
- NODEMASK_FREE(nodes_allowed);
- nodes_allowed = &node_states[N_MEMORY];
- }
- h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
-
- if (nodes_allowed != &node_states[N_MEMORY])
- NODEMASK_FREE(nodes_allowed);
- }
+ if (write)
+ ret = __nr_hugepages_store_common(obey_mempolicy, h,
+ NUMA_NO_NODE, tmp, *length);
out:
return ret;
}
@@ -2169,7 +2308,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
tmp = h->nr_overcommit_huge_pages;
- if (write && h->order >= MAX_ORDER)
+ if (write && hstate_is_gigantic(h))
return -EINVAL;
table->data = &tmp;
@@ -2377,6 +2516,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
update_mmu_cache(vma, address, ptep);
}
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_migration_entry(swp))
+ return 1;
+ else
+ return 0;
+}
+
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp))
+ return 1;
+ else
+ return 0;
+}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
@@ -2416,7 +2580,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
- if (!huge_pte_none(huge_ptep_get(src_pte))) {
+ entry = huge_ptep_get(src_pte);
+ if (huge_pte_none(entry)) { /* skip none entry */
+ ;
+ } else if (unlikely(is_hugetlb_entry_migration(entry) ||
+ is_hugetlb_entry_hwpoisoned(entry))) {
+ swp_entry_t swp_entry = pte_to_swp_entry(entry);
+
+ if (is_write_migration_entry(swp_entry) && cow) {
+ /*
+ * COW mappings require pages in both
+ * parent and child to be set to read.
+ */
+ make_migration_entry_read(&swp_entry);
+ entry = swp_entry_to_pte(swp_entry);
+ set_huge_pte_at(src, addr, src_pte, entry);
+ }
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ } else {
if (cow)
huge_ptep_set_wrprotect(src, addr, src_pte);
entry = huge_ptep_get(src_pte);
@@ -2435,32 +2616,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
return ret;
}
-static int is_hugetlb_entry_migration(pte_t pte)
-{
- swp_entry_t swp;
-
- if (huge_pte_none(pte) || pte_present(pte))
- return 0;
- swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp))
- return 1;
- else
- return 0;
-}
-
-static int is_hugetlb_entry_hwpoisoned(pte_t pte)
-{
- swp_entry_t swp;
-
- if (huge_pte_none(pte) || pte_present(pte))
- return 0;
- swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp))
- return 1;
- else
- return 0;
-}
-
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct page *ref_page)
@@ -2595,8 +2750,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
* from other VMAs and let the children be SIGKILLed if they are faulting the
* same region.
*/
-static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page *page, unsigned long address)
+static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page *page, unsigned long address)
{
struct hstate *h = hstate_vma(vma);
struct vm_area_struct *iter_vma;
@@ -2635,8 +2790,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
address + huge_page_size(h), page);
}
mutex_unlock(&mapping->i_mmap_mutex);
-
- return 1;
}
/*
@@ -2651,7 +2804,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
- int outside_reserve = 0;
+ int ret = 0, outside_reserve = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
@@ -2681,14 +2834,14 @@ retry_avoidcopy:
page_cache_get(old_page);
- /* Drop page table lock as buddy allocator may be called */
+ /*
+ * Drop page table lock as buddy allocator may be called. It will
+ * be acquired again before returning to the caller, as expected.
+ */
spin_unlock(ptl);
new_page = alloc_huge_page(vma, address, outside_reserve);
if (IS_ERR(new_page)) {
- long err = PTR_ERR(new_page);
- page_cache_release(old_page);
-
/*
* If a process owning a MAP_PRIVATE mapping fails to COW,
* it is due to references held by a child and an insufficient
@@ -2697,29 +2850,25 @@ retry_avoidcopy:
* may get SIGKILLed if it later faults.
*/
if (outside_reserve) {
+ page_cache_release(old_page);
BUG_ON(huge_pte_none(pte));
- if (unmap_ref_private(mm, vma, old_page, address)) {
- BUG_ON(huge_pte_none(pte));
- spin_lock(ptl);
- ptep = huge_pte_offset(mm, address & huge_page_mask(h));
- if (likely(ptep &&
- pte_same(huge_ptep_get(ptep), pte)))
- goto retry_avoidcopy;
- /*
- * race occurs while re-acquiring page table
- * lock, and our job is done.
- */
- return 0;
- }
- WARN_ON_ONCE(1);
+ unmap_ref_private(mm, vma, old_page, address);
+ BUG_ON(huge_pte_none(pte));
+ spin_lock(ptl);
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ if (likely(ptep &&
+ pte_same(huge_ptep_get(ptep), pte)))
+ goto retry_avoidcopy;
+ /*
+ * race occurs while re-acquiring page table
+ * lock, and our job is done.
+ */
+ return 0;
}
- /* Caller expects lock to be held */
- spin_lock(ptl);
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- else
- return VM_FAULT_SIGBUS;
+ ret = (PTR_ERR(new_page) == -ENOMEM) ?
+ VM_FAULT_OOM : VM_FAULT_SIGBUS;
+ goto out_release_old;
}
/*
@@ -2727,11 +2876,8 @@ retry_avoidcopy:
* anon_vma prepared.
*/
if (unlikely(anon_vma_prepare(vma))) {
- page_cache_release(new_page);
- page_cache_release(old_page);
- /* Caller expects lock to be held */
- spin_lock(ptl);
- return VM_FAULT_OOM;
+ ret = VM_FAULT_OOM;
+ goto out_release_all;
}
copy_user_huge_page(new_page, old_page, address, vma,
@@ -2741,6 +2887,7 @@ retry_avoidcopy:
mmun_start = address & huge_page_mask(h);
mmun_end = mmun_start + huge_page_size(h);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
/*
* Retake the page table lock to check for racing updates
* before the page tables are altered
@@ -2761,12 +2908,13 @@ retry_avoidcopy:
}
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+out_release_all:
page_cache_release(new_page);
+out_release_old:
page_cache_release(old_page);
- /* Caller expects lock to be held */
- spin_lock(ptl);
- return 0;
+ spin_lock(ptl); /* Caller expects lock to be held */
+ return ret;
}
/* Return the pagecache page at a given address within a VMA */