summaryrefslogtreecommitdiffstats
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- mm/hugetlb.c 214
1 files changed, 131 insertions, 83 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 901ac523a1c..3e52df7c471 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,6 +22,10 @@ unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
+
+/*
+ * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
+ */
static DEFINE_SPINLOCK(hugetlb_lock);
static void enqueue_huge_page(struct page *page)
@@ -61,8 +65,10 @@ static struct page *alloc_fresh_huge_page(void)
HUGETLB_PAGE_ORDER);
nid = (nid + 1) % num_online_nodes();
if (page) {
+ spin_lock(&hugetlb_lock);
nr_huge_pages++;
nr_huge_pages_node[page_to_nid(page)]++;
+ spin_unlock(&hugetlb_lock);
}
return page;
}
@@ -103,6 +109,9 @@ static int __init hugetlb_init(void)
unsigned long i;
struct page *page;
+ if (HPAGE_SHIFT == 0)
+ return 0;
+
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&hugepage_freelists[i]);
@@ -234,7 +243,6 @@ unsigned long hugetlb_total_pages(void)
{
return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}
-EXPORT_SYMBOL(hugetlb_total_pages);
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
@@ -274,21 +282,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
{
pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
- unsigned long addr = vma->vm_start;
- unsigned long end = vma->vm_end;
+ unsigned long addr;
- while (addr < end) {
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ src_pte = huge_pte_offset(src, addr);
+ if (!src_pte)
+ continue;
dst_pte = huge_pte_alloc(dst, addr);
if (!dst_pte)
goto nomem;
- src_pte = huge_pte_offset(src, addr);
- BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
- entry = *src_pte;
- ptepage = pte_page(entry);
- get_page(ptepage);
- add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(dst, addr, dst_pte, entry);
- addr += HPAGE_SIZE;
+ spin_lock(&dst->page_table_lock);
+ spin_lock(&src->page_table_lock);
+ if (!pte_none(*src_pte)) {
+ entry = *src_pte;
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+ add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ }
+ spin_unlock(&src->page_table_lock);
+ spin_unlock(&dst->page_table_lock);
}
return 0;
@@ -309,12 +322,14 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
BUG_ON(start & ~HPAGE_MASK);
BUG_ON(end & ~HPAGE_MASK);
+ spin_lock(&mm->page_table_lock);
+
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
- if (! ptep)
- /* This can happen on truncate, or if an
- * mmap() is aborted due to an error before
- * the prefault */
+ if (!ptep)
continue;
pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -323,74 +338,99 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
page = pte_page(pte);
put_page(page);
+ add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
}
- add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
+
+ spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
}
-void zap_hugepage_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long length)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+ unsigned long idx)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct page *page;
+ int err;
+ struct inode *inode = mapping->host;
+ unsigned long size;
+
+retry:
+ page = find_lock_page(mapping, idx);
+ if (page)
+ goto out;
+
+ /* Check to make sure the mapping hasn't been truncated */
+ size = i_size_read(inode) >> HPAGE_SHIFT;
+ if (idx >= size)
+ goto out;
+
+ if (hugetlb_get_quota(mapping))
+ goto out;
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ goto out;
+ }
- spin_lock(&mm->page_table_lock);
- unmap_hugepage_range(vma, start, start + length);
- spin_unlock(&mm->page_table_lock);
+ err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ if (err) {
+ put_page(page);
+ hugetlb_put_quota(mapping);
+ if (err == -EEXIST)
+ goto retry;
+ page = NULL;
+ }
+out:
+ return page;
}
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
{
- struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret = 0;
+ int ret = VM_FAULT_SIGBUS;
+ unsigned long idx;
+ unsigned long size;
+ pte_t *pte;
+ struct page *page;
+ struct address_space *mapping;
- WARN_ON(!is_vm_hugetlb_page(vma));
- BUG_ON(vma->vm_start & ~HPAGE_MASK);
- BUG_ON(vma->vm_end & ~HPAGE_MASK);
+ pte = huge_pte_alloc(mm, address);
+ if (!pte)
+ goto out;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- hugetlb_prefault_arch_hook(mm);
+ /*
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
+ */
+ page = find_lock_huge_page(mapping, idx);
+ if (!page)
+ goto out;
spin_lock(&mm->page_table_lock);
- for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
- unsigned long idx;
- pte_t *pte = huge_pte_alloc(mm, addr);
- struct page *page;
+ size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+ if (idx >= size)
+ goto backout;
- if (!pte) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = VM_FAULT_MINOR;
+ if (!pte_none(*pte))
+ goto backout;
- idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
- + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- page = find_get_page(mapping, idx);
- if (!page) {
- /* charge the fs quota first */
- if (hugetlb_get_quota(mapping)) {
- ret = -ENOMEM;
- goto out;
- }
- page = alloc_huge_page();
- if (!page) {
- hugetlb_put_quota(mapping);
- ret = -ENOMEM;
- goto out;
- }
- ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
- if (! ret) {
- unlock_page(page);
- } else {
- hugetlb_put_quota(mapping);
- free_huge_page(page);
- goto out;
- }
- }
- add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
- }
-out:
+ add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
spin_unlock(&mm->page_table_lock);
+ unlock_page(page);
+out:
return ret;
+
+backout:
+ spin_unlock(&mm->page_table_lock);
+ hugetlb_put_quota(mapping);
+ unlock_page(page);
+ put_page(page);
+ goto out;
}
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -400,28 +440,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long vpfn, vaddr = *position;
int remainder = *length;
- BUG_ON(!is_vm_hugetlb_page(vma));
-
vpfn = vaddr/PAGE_SIZE;
+ spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
+ pte_t *pte;
+ struct page *page;
- if (pages) {
- pte_t *pte;
- struct page *page;
-
- /* Some archs (sparc64, sh*) have multiple
- * pte_ts to each hugepage. We have to make
- * sure we get the first, for the page
- * indexing below to work. */
- pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+ /*
+ * Some archs (sparc64, sh*) have multiple pte_ts to
+ * each hugepage. We have to make sure we get the
+ * first, for the page indexing below to work.
+ */
+ pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
- /* hugetlb should be locked, and hence, prefaulted */
- WARN_ON(!pte || pte_none(*pte));
+ if (!pte || pte_none(*pte)) {
+ int ret;
- page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+ spin_unlock(&mm->page_table_lock);
+ ret = hugetlb_fault(mm, vma, vaddr, 0);
+ spin_lock(&mm->page_table_lock);
+ if (ret == VM_FAULT_MINOR)
+ continue;
- WARN_ON(!PageCompound(page));
+ remainder = 0;
+ if (!i)
+ i = -EFAULT;
+ break;
+ }
+ if (pages) {
+ page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
get_page(page);
pages[i] = page;
}
@@ -434,7 +482,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
--remainder;
++i;
}
-
+ spin_unlock(&mm->page_table_lock);
*length = remainder;
*position = vaddr;