diff options
32 files changed, 282 insertions, 105 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 8d2ae24b9f4..1feb169274f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -407,6 +407,12 @@ config CLONE_BACKWARDS2 help Architecture has the first two arguments of clone(2) swapped. +config CLONE_BACKWARDS3 + bool + help + Architecture has tls passed as the 3rd argument of clone(2), + not the 5th one. + config ODD_RT_SIGACTION bool help diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 33a97929d05..77d442ab28c 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -158,6 +158,7 @@ source "kernel/Kconfig.hz" endmenu source "init/Kconfig" +source "kernel/Kconfig.freezer" source "drivers/Kconfig" source "fs/Kconfig" diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index d22a4ecffff..4fab52294d9 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -28,7 +28,7 @@ config MICROBLAZE select GENERIC_CLOCKEVENTS select GENERIC_IDLE_POLL_SETUP select MODULES_USE_ELF_RELA - select CLONE_BACKWARDS + select CLONE_BACKWARDS3 config SWAP def_bool n diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 99dbab1c59a..d60bf98fa5c 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -55,6 +55,7 @@ config GENERIC_CSUM source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "Processor type and features" diff --git a/arch/score/Kconfig b/arch/score/Kconfig index c8def8bc902..5fc237581ca 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -87,6 +87,8 @@ config STACKTRACE_SUPPORT source "init/Kconfig" +source "kernel/Kconfig.freezer" + config MMU def_bool y diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index f2b489cf160..3bf2dd0cf61 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -55,9 +55,53 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_MEM_SOFT_DIRTY + +/* + * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and + * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset + * into this range. + */ +#define PTE_FILE_MAX_BITS 28 +#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) +#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) +#define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1) +#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) +#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) +#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) + +#define pte_to_pgoff(pte) \ + ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ + & ((1U << PTE_FILE_BITS1) - 1))) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << (PTE_FILE_BITS1)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) + +#define pgoff_to_pte(off) \ + ((pte_t) { .pte_low = \ + ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ + + ((((off) >> PTE_FILE_BITS1) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << PTE_FILE_SHIFT2) \ + + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << PTE_FILE_SHIFT3) \ + + ((((off) >> \ + (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ + << PTE_FILE_SHIFT4) \ + + _PAGE_FILE }) + +#else /* CONFIG_MEM_SOFT_DIRTY */ + /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, - * split up the 29 bits of offset into this range: + * split up the 29 bits of offset into this range. */ #define PTE_FILE_MAX_BITS 29 #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) @@ -88,6 +132,8 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) << PTE_FILE_SHIFT3) \ + _PAGE_FILE }) +#endif /* CONFIG_MEM_SOFT_DIRTY */ + /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 4cc9f2b7cdc..81bb91b49a8 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -179,6 +179,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the high part. + * + * For soft-dirty tracking 11 bit is taken from + * the low part of pte as well. */ #define pte_to_pgoff(pte) ((pte).pte_high) #define pgoff_to_pte(off) \ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7dc305a4605..1c00631164c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -314,6 +314,36 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline pte_t pte_file_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_file_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline int pte_file_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFT_DIRTY; +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index c98ac63aae4..f4843e03113 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -61,12 +61,27 @@ * they do not conflict with each other. */ +#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN + #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif +/* + * Tracking soft dirty bit when a page goes to a swap is tricky. + * We need a bit which can be stored in pte _and_ not conflict + * with swap entry format. On x86 bits 6 and 7 are *not* involved + * into swap entry computation, but bit 6 is used for nonlinear + * file mapping, so we borrow bit 7 for soft dirty tracking. + */ +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE +#else +#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index dbded5aedb8..48f8375e4c6 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -101,7 +101,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin, *begin = new_begin; } } else { - *begin = TASK_UNMAPPED_BASE; + *begin = mmap_legacy_base(); *end = TASK_SIZE; } } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 62c29a5bfe2..f63778cb236 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -98,7 +98,7 @@ static unsigned long mmap_base(void) * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 * does, but not when emulating X86_32 */ -static unsigned long mmap_legacy_base(void) +unsigned long mmap_legacy_base(void) { if (mmap_is_ia32()) return TASK_UNMAPPED_BASE; diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 99cb944a002..4d45dba7fb8 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -906,16 +906,10 @@ bio_pageinc(struct bio *bio) int i; bio_for_each_segment(bv, bio, i) { - page = bv->bv_page; /* Non-zero page count for non-head members of - * compound pages is no longer allowed by the kernel, - * but this has never been seen here. + * compound pages is no longer allowed by the kernel. */ - if (unlikely(PageCompound(page))) - if (compound_trans_head(page) != page) { - pr_crit("page tail used for block I/O\n"); - BUG(); - } + page = compound_trans_head(bv->bv_page); atomic_inc(&page->_count); } } @@ -924,10 +918,13 @@ static void bio_pagedec(struct bio *bio) { struct bio_vec *bv; + struct page *page; int i; - bio_for_each_segment(bv, bio, i) - atomic_dec(&bv->bv_page->_count); + bio_for_each_segment(bv, bio, i) { + page = compound_trans_head(bv->bv_page); + atomic_dec(&page->_count); + } } static void diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c index 767fee2ab34..26019531db1 100644 --- a/drivers/rtc/rtc-stmp3xxx.c +++ b/drivers/rtc/rtc-stmp3xxx.c @@ -23,6 +23,7 @@ #include <linux/init.h> #include <linux/platform_device.h> #include <linux/interrupt.h> +#include <linux/delay.h> #include <linux/rtc.h> #include <linux/slab.h> #include <linux/of_device.h> @@ -119,24 +120,39 @@ static void stmp3xxx_wdt_register(struct platform_device *rtc_pdev) } #endif /* CONFIG_STMP3XXX_RTC_WATCHDOG */ -static void stmp3xxx_wait_time(struct stmp3xxx_rtc_data *rtc_data) +static int stmp3xxx_wait_time(struct stmp3xxx_rtc_data *rtc_data) { + int timeout = 5000; /* 3ms according to i.MX28 Ref Manual */ /* - * The datasheet doesn't say which way round the - * NEW_REGS/STALE_REGS bitfields go. In fact it's 0x1=P0, - * 0x2=P1, .., 0x20=P5, 0x40=ALARM, 0x80=SECONDS + * The i.MX28 Applications Processor Reference Manual, Rev. 1, 2010 + * states: + * | The order in which registers are updated is + * | Persistent 0, 1, 2, 3, 4, 5, Alarm, Seconds. + * | (This list is in bitfield order, from LSB to MSB, as they would + * | appear in the STALE_REGS and NEW_REGS bitfields of the HW_RTC_STAT + * | register. For example, the Seconds register corresponds to + * | STALE_REGS or NEW_REGS containing 0x80.) */ - while (readl(rtc_data->io + STMP3XXX_RTC_STAT) & - (0x80 << STMP3XXX_RTC_STAT_STALE_SHIFT)) - cpu_relax(); + do { + if (!(readl(rtc_data->io + STMP3XXX_RTC_STAT) & + (0x80 << STMP3XXX_RTC_STAT_STALE_SHIFT))) + return 0; + udelay(1); + } while (--timeout > 0); + return (readl(rtc_data->io + STMP3XXX_RTC_STAT) & + (0x80 << STMP3XXX_RTC_STAT_STALE_SHIFT)) ? -ETIME : 0; } /* Time read/write */ static int stmp3xxx_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm) { + int ret; struct stmp3xxx_rtc_data *rtc_data = dev_get_drvdata(dev); - stmp3xxx_wait_time(rtc_data); + ret = stmp3xxx_wait_time(rtc_data); + if (ret) + return ret; + rtc_time_to_tm(readl(rtc_data->io + STMP3XXX_RTC_SECONDS), rtc_tm); return 0; } @@ -146,8 +162,7 @@ static int stmp3xxx_rtc_set_mmss(struct device *dev, unsigned long t) struct stmp3xxx_rtc_data *rtc_data = dev_get_drvdata(dev); writel(t, rtc_data->io + STMP3XXX_RTC_SECONDS); - stmp3xxx_wait_time(rtc_data); - return 0; + return stmp3xxx_wait_time(rtc_data); } /* interrupt(s) handler */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a3f868ae3fd..34423978b17 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -463,6 +463,14 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, return inode; } +/* + * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never + * be taken from reclaim -- unlike regular filesystems. This needs an + * annotation because huge_pmd_share() does an allocation under + * i_mmap_mutex. + */ +struct lock_class_key hugetlbfs_i_mmap_mutex_key; + static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev) @@ -474,6 +482,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); + lockdep_set_class(&inode->i_mapping->i_mmap_mutex, + &hugetlbfs_i_mmap_mutex_key); inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 79736a28d84..2abf97b2a59 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1757,7 +1757,7 @@ try_again: goto out; } else if (ret == 1) { clusters_need = wc->w_clen; - ret = ocfs2_refcount_cow(inode, filp, di_bh, + ret = ocfs2_refcount_cow(inode, di_bh, wc->w_cpos, wc->w_clen, UINT_MAX); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index eb760d8acd5..30544ce8e9f 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2153,11 +2153,9 @@ int ocfs2_empty_dir(struct inode *inode) { int ret; struct ocfs2_empty_dir_priv priv = { - .ctx.actor = ocfs2_empty_dir_filldir + .ctx.actor = ocfs2_empty_dir_filldir, }; - memset(&priv, 0, sizeof(priv)); - if (ocfs2_dir_indexed(inode)) { ret = ocfs2_empty_dir_dx(inode, &priv); if (ret) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 41000f223ca..3261d71319e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -370,7 +370,7 @@ static int ocfs2_cow_file_pos(struct inode *inode, if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) goto out; - return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); + return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); out: return status; @@ -899,7 +899,7 @@ static int ocfs2_zero_extend_get_range(struct inode *inode, zero_clusters = last_cpos - zero_cpos; if (needs_cow) { - rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, + rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, UINT_MAX); if (rc) { mlog_errno(rc); @@ -2078,7 +2078,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode, *meta_level = 1; - ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); + ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); if (ret) mlog_errno(ret); out: diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 96f9ac237e8..0a992737dca 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -537,7 +537,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + - ocfs2_quota_trans_credits(sb) + bits_wanted; + ocfs2_quota_trans_credits(sb); } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index f1fc172175b..452068b4574 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -69,7 +69,7 @@ static int __ocfs2_move_extent(handle_t *handle, u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); - ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, + ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos, p_cpos, new_p_cpos, len); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9f6b96a0961..a70d604593b 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -49,7 +49,6 @@ struct ocfs2_cow_context { struct inode *inode; - struct file *file; u32 cow_start; u32 cow_len; struct ocfs2_extent_tree data_et; @@ -66,7 +65,7 @@ struct ocfs2_cow_context { u32 *num_clusters, unsigned int *extent_flags); int (*cow_duplicate_clusters)(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); }; @@ -2922,14 +2921,12 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) } int ocfs2_duplicate_clusters_by_page(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len) { int ret = 0, partial; - struct inode *inode = file_inode(file); - struct ocfs2_caching_info *ci = INODE_CACHE(inode); - struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct super_block *sb = inode->i_sb; u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; @@ -2978,13 +2975,6 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, - &file->f_ra, file, - page, page_index, - readahead_pages); - } - if (!PageUptodate(page)) { ret = block_read_full_page(page, ocfs2_get_block); if (ret) { @@ -3004,7 +2994,8 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, } } - ocfs2_map_and_dirty_page(inode, handle, from, to, + ocfs2_map_and_dirty_page(inode, + handle, from, to, page, 0, &new_block); mark_page_accessed(page); unlock: @@ -3020,12 +3011,11 @@ unlock: } int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len) { int ret = 0; - struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct ocfs2_caching_info *ci = INODE_CACHE(inode); int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); @@ -3150,7 +3140,7 @@ static int ocfs2_replace_clusters(handle_t *handle, /*If the old clusters is unwritten, no need to duplicate. */ if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { - ret = context->cow_duplicate_clusters(handle, context->file, + ret = context->cow_duplicate_clusters(handle, context->inode, cpos, old, new, len); if (ret) { mlog_errno(ret); @@ -3428,35 +3418,12 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) return ret; } -static void ocfs2_readahead_for_cow(struct inode *inode, - struct file *file, - u32 start, u32 len) -{ - struct address_space *mapping; - pgoff_t index; - unsigned long num_pages; - int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; - - if (!file) - return; - - mapping = file->f_mapping; - num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT; - if (!num_pages) - num_pages = 1; - - index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT; - page_cache_sync_readahead(mapping, &file->f_ra, file, - index, num_pages); -} - /* * Starting at cpos, try to CoW write_len clusters. Don't CoW * past max_cpos. This will stop when it runs into a hole or an * unrefcounted extent. */ static int ocfs2_refcount_cow_hunk(struct inode *inode, - struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3485,8 +3452,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, BUG_ON(cow_len == 0); - ocfs2_readahead_for_cow(inode, file, cow_start, cow_len); - context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); if (!context) { ret = -ENOMEM; @@ -3508,7 +3473,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, context->ref_root_bh = ref_root_bh; context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; context->get_clusters = ocfs2_di_get_clusters; - context->file = file; ocfs2_init_dinode_extent_tree(&context->data_et, INODE_CACHE(inode), di_bh); @@ -3537,7 +3501,6 @@ out: * clusters between cpos and cpos+write_len are safe to modify. */ int ocfs2_refcount_cow(struct inode *inode, - struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3557,7 +3520,7 @@ int ocfs2_refcount_cow(struct inode *inode, num_clusters = write_len; if (ext_flags & OCFS2_EXT_REFCOUNTED) { - ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos, + ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, num_clusters, max_cpos); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 7754608c83a..6422bbcdb52 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -53,7 +53,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, int *credits, int *ref_blocks); int ocfs2_refcount_cow(struct inode *inode, - struct file *filep, struct buffer_head *di_bh, + struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos); typedef int (ocfs2_post_refcount_func)(struct inode *inode, @@ -85,11 +85,11 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, u32 cpos, u32 write_len, struct ocfs2_post_refcount *post); int ocfs2_duplicate_clusters_by_page(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); int ocfs2_cow_sync_writeback(struct super_block *sb, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dbf61f6174f..107d026f5d6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -730,8 +730,16 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * of how soft-dirty works. */ pte_t ptent = *pte; - ptent = pte_wrprotect(ptent); - ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + + if (pte_present(ptent)) { + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_clear_soft_dirty(ptent); + } else if (pte_file(ptent)) { + ptent = pte_file_clear_soft_dirty(ptent); + } + set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -752,14 +760,15 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; - if (!pte_present(ptent)) - continue; if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); continue; } + if (!pte_present(ptent)) + continue; + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -859,7 +868,7 @@ typedef struct { } pagemap_entry_t; struct pagemapread { - int pos, len; + int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool v2; }; @@ -867,7 +876,7 @@ struct pagemapread { #define PAGEMAP_WALK_SIZE (PMD_SIZE) #define PAGEMAP_WALK_MASK (PMD_MASK) -#define PM_ENTRY_BYTES sizeof(u64) +#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) #define PM_STATUS_BITS 3 #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) @@ -930,8 +939,10 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, flags = PM_PRESENT; page = vm_normal_page(vma, addr, pte); } else if (is_swap_pte(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - + swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; + entry = pte_to_swp_entry(pte); frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags = PM_SWAP; @@ -1116,8 +1127,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, goto out_task; pm.v2 = soft_dirty_cleared; - pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); + pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); + pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); ret = -ENOMEM; if (!pm.buffer) goto out_task; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 2f47ade1b56..0807ddf97b0 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -417,6 +417,36 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd; } + +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return pte; +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + return 0; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_file_clear_soft_dirty(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_file_mksoft_dirty(pte_t pte) +{ + return pte; +} + +static inline int pte_file_soft_dirty(pte_t pte) +{ + return 0; +} #endif #ifndef __HAVE_PFNMAP_TRACKING diff --git a/include/linux/sched.h b/include/linux/sched.h index 078066daffd..e9995eb5985 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -314,6 +314,7 @@ struct nsproxy; struct user_namespace; #ifdef CONFIG_MMU +extern unsigned long mmap_legacy_base(void); extern void arch_pick_mmap_layout(struct mm_struct *mm); extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c5fd30d2a41..8d4fa82bfb9 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte) swp_entry_t arch_entry; BUG_ON(pte_file(pte)); + if (pte_swp_soft_dirty(pte)) + pte = pte_swp_clear_soft_dirty(pte); arch_entry = __pte_to_swp_entry(pte); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 4147d700a29..84662ecc7b5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -802,9 +802,14 @@ asmlinkage long sys_vfork(void); asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int, int __user *); #else +#ifdef CONFIG_CLONE_BACKWARDS3 +asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *, + int __user *, int); +#else asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int __user *, int); #endif +#endif asmlinkage long sys_execve(const char __user *filename, const char __user *const __user *argv, diff --git a/kernel/fork.c b/kernel/fork.c index 403d2bb8a96..e23bb19e2a3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1679,6 +1679,12 @@ SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, int, tls_val) +#elif defined(CONFIG_CLONE_BACKWARDS3) +SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, + int, stack_size, + int __user *, parent_tidptr, + int __user *, child_tidptr, + int, tls_val) #else SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, diff --git a/mm/fremap.c b/mm/fremap.c index 87da3590c61..5bff0814776 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -57,17 +57,22 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot) { int err = -ENOMEM; - pte_t *pte; + pte_t *pte, ptfile; spinlock_t *ptl; pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; - if (!pte_none(*pte)) + ptfile = pgoff_to_pte(pgoff); + + if (!pte_none(*pte)) { + if (pte_present(*pte) && pte_soft_dirty(*pte)) + pte_file_mksoft_dirty(ptfile); zap_pte(mm, vma, addr, pte); + } - set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); + set_pte_at(mm, addr, pte, ptfile); /* * We don't need to run update_mmu_cache() here because the "file pte" * being installed by install_file_pte() is not a real pte - it's a diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c290a1cf386..c5792a5d87c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3195,11 +3195,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, if (!s->memcg_params) return -ENOMEM; - INIT_WORK(&s->memcg_params->destroy, - kmem_cache_destroy_work_func); if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; + INIT_WORK(&s->memcg_params->destroy, + kmem_cache_destroy_work_func); } else s->memcg_params->is_root_cache = true; diff --git a/mm/memory.c b/mm/memory.c index 1ce2e2a734f..40268410732 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1141,9 +1141,12 @@ again: continue; if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, - addr) != page->index) - set_pte_at(mm, addr, pte, - pgoff_to_pte(page->index)); + addr) != page->index) { + pte_t ptfile = pgoff_to_pte(page->index); + if (pte_soft_dirty(ptent)) + pte_file_mksoft_dirty(ptfile); + set_pte_at(mm, addr, pte, ptfile); + } if (PageAnon(page)) rss[MM_ANONPAGES]--; else { @@ -3115,6 +3118,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, exclusive = 1; } flush_icache_page(vma, page); + if (pte_swp_soft_dirty(orig_pte)) + pte = pte_mksoft_dirty(pte); set_pte_at(mm, address, page_table, pte); if (page == swapcache) do_page_add_anon_rmap(page, vma, address, exclusive); @@ -3408,6 +3413,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) + pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); diff --git a/mm/rmap.c b/mm/rmap.c index cd356df4f71..b2e29acd7e3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_entry_to_pte(make_hwpoison_entry(page))); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; + pte_t swp_pte; if (PageSwapCache(page)) { /* @@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); entry = make_migration_entry(page, pte_write(pteval)); } - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pte, swp_pte); BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (TTU_ACTION(flags) == TTU_MIGRATION)) { @@ -1401,8 +1405,12 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ - if (page->index != linear_page_index(vma, address)) - set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); + if (page->index != linear_page_index(vma, address)) { + pte_t ptfile = pgoff_to_pte(page->index); + if (pte_soft_dirty(pteval)) + pte_file_mksoft_dirty(ptfile); + set_pte_at(mm, address, pte, ptfile); + } /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) diff --git a/mm/swapfile.c b/mm/swapfile.c index 36af6eeaa67..6cf2e60983b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, int free) } #endif /* CONFIG_HIBERNATION */ +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * When pte keeps soft dirty bit the pte generated + * from swap entry does not has it, still it's same + * pte from logical point of view. + */ + pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); + return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); +#else + return pte_same(pte, swp_pte); +#endif +} + /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to @@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { mem_cgroup_cancel_charge_swapin(memcg); ret = 0; goto out; @@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ - if (unlikely(pte_same(*pte, swp_pte))) { + if (unlikely(maybe_same_pte(*pte, swp_pte))) { pte_unmap(pte); ret = unuse_pte(vma, pmd, addr, entry, page); if (ret) |