From 31bd61f2bb79e098117d823e054342b03aa87668 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Wed, 17 Sep 2008 16:45:37 +1000 Subject: [XFS] Move memory allocations for log tracing out of the critical path Memory allocations for log->l_grant_trace and iclog->ic_trace are done on demand when the first event is logged. In xlog_state_get_iclog_space() we call xlog_trace_iclog() under a spinlock and allocating memory here can cause us to sleep with a spinlock held and deadlock the system. For the log grant tracing we use KM_NOSLEEP but that means we can lose trace entries. Since there is no locking to serialize the log grant tracing we could race and have multiple allocations and leak memory. So move the allocations to where we initialize the log/iclog structures. Use KM_NOFS to avoid recursing into the filesystem and drop log->l_trace since it's not even used. SGI-PV: 983738 SGI-Modid: xfs-linux-melb:xfs-kern:31896a Signed-off-by: Lachlan McIlroy Signed-off-by: Christoph Hellwig --- fs/xfs/xfs_log.c | 60 ++++++++++++++++++++++++++++++++++----------------- fs/xfs/xfs_log_priv.h | 1 - 2 files changed, 40 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ccba14eb9db..7812bd036e2 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -124,16 +124,27 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, STATIC int xlog_iclogs_empty(xlog_t *log); #if defined(XFS_LOG_TRACE) + +#define XLOG_TRACE_LOGGRANT_SIZE 2048 +#define XLOG_TRACE_ICLOG_SIZE 256 + +void +xlog_trace_loggrant_alloc(xlog_t *log) +{ + log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS); +} + +void +xlog_trace_loggrant_dealloc(xlog_t *log) +{ + ktrace_free(log->l_grant_trace); +} + void xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) { unsigned long cnts; - if (!log->l_grant_trace) { - log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP); - if (!log->l_grant_trace) - return; - } /* ticket counts are 1 byte each */ cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; @@ -156,11 +167,21 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) (void *)((unsigned long)tic->t_unit_res)); } +void +xlog_trace_iclog_alloc(xlog_in_core_t *iclog) +{ + iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS); +} + +void +xlog_trace_iclog_dealloc(xlog_in_core_t *iclog) +{ + ktrace_free(iclog->ic_trace); +} + void xlog_trace_iclog(xlog_in_core_t *iclog, uint state) { - if (!iclog->ic_trace) - iclog->ic_trace = ktrace_alloc(256, KM_NOFS); ktrace_enter(iclog->ic_trace, (void *)((unsigned long)state), (void *)((unsigned long)current_pid()), @@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state) (void *)NULL, (void *)NULL); } #else + +#define xlog_trace_loggrant_alloc(log) +#define xlog_trace_loggrant_dealloc(log) #define xlog_trace_loggrant(log,tic,string) + +#define xlog_trace_iclog_alloc(iclog) +#define xlog_trace_iclog_dealloc(iclog) #define xlog_trace_iclog(iclog,state) + #endif /* XFS_LOG_TRACE */ @@ -1231,6 +1259,7 @@ xlog_alloc_log(xfs_mount_t *mp, spin_lock_init(&log->l_grant_lock); sv_init(&log->l_flush_wait, 0, "flush_wait"); + xlog_trace_loggrant_alloc(log); /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1285,6 +1314,8 @@ xlog_alloc_log(xfs_mount_t *mp, sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); + xlog_trace_iclog_alloc(iclog); + iclogp = &iclog->ic_next; } *iclogp = log->l_iclog; /* complete ring */ @@ -1565,11 +1596,7 @@ xlog_dealloc_log(xlog_t *log) sv_destroy(&iclog->ic_force_wait); sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); -#ifdef XFS_LOG_TRACE - if (iclog->ic_trace != NULL) { - ktrace_free(iclog->ic_trace); - } -#endif + xlog_trace_iclog_dealloc(iclog); next_iclog = iclog->ic_next; kmem_free(iclog); iclog = next_iclog; @@ -1578,14 +1605,7 @@ xlog_dealloc_log(xlog_t *log) spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); -#ifdef XFS_LOG_TRACE - if (log->l_trace != NULL) { - ktrace_free(log->l_trace); - } - if (log->l_grant_trace != NULL) { - ktrace_free(log->l_grant_trace); - } -#endif + xlog_trace_loggrant_dealloc(log); log->l_mp->m_log = NULL; kmem_free(log); } /* xlog_dealloc_log */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index c8a5b22ee3e..e7d8f84443f 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -448,7 +448,6 @@ typedef struct log { int l_grant_write_bytes; #ifdef XFS_LOG_TRACE - struct ktrace *l_trace; struct ktrace *l_grant_trace; #endif -- cgit v1.2.3-70-g09d2 From 6efdf281777eb07fac28ac2b2d7df1e619ee6da1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Sep 2008 16:49:33 +1000 Subject: [XFS] Fix regression introduced by remount fixup Logically we would return an error in xfs_fs_remount code to prevent users from believing they might have changed mount options using remount which can't be changed. But unfortunately mount(8) adds all options from mtab and fstab to the mount arguments in some cases so we can't blindly reject options, but have to check for each specified option if it actually differs from the currently set option and only reject it if that's the case. Until that is implemented we return success for every remount request, and silently ignore all options that we can't actually change. SGI-PV: 985710 SGI-Modid: xfs-linux-melb:xfs-kern:31908a Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_super.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 73c65f19e54..18d3c848783 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1302,9 +1302,29 @@ xfs_fs_remount( mp->m_flags &= ~XFS_MOUNT_BARRIER; break; default: + /* + * Logically we would return an error here to prevent + * users from believing they might have changed + * mount options using remount which can't be changed. + * + * But unfortunately mount(8) adds all options from + * mtab and fstab to the mount arguments in some cases + * so we can't blindly reject options, but have to + * check for each specified option if it actually + * differs from the currently set option and only + * reject it if that's the case. + * + * Until that is implemented we return success for + * every remount request, and silently ignore all + * options that we can't actually change. + */ +#if 0 printk(KERN_INFO "XFS: mount option \"%s\" not supported for remount\n", p); return -EINVAL; +#else + return 0; +#endif } } -- cgit v1.2.3-70-g09d2 From 364f358a734ddcd827c662ccbfa58ee3ac510762 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Wed, 17 Sep 2008 16:50:14 +1000 Subject: [XFS] Prevent direct I/O from mapping extents beyond eof With the help from some tracing I found that we try to map extents beyond eof when doing a direct I/O read. It appears that the way to inform the generic direct I/O path (ie do_direct_IO()) that we have breached eof is to return an unmapped buffer from xfs_get_blocks_direct(). This will cause do_direct_IO() to jump to the hole handling code where is will check for eof and then abort. This problem was found because a direct I/O read was trying to map beyond eof and was encountering delayed allocations. The delayed allocations beyond eof are speculative allocations and they didn't get converted when the direct I/O flushed the file because there was only enough space in the current AG to convert and write out the dirty pages within eof. Note that xfs_iomap_write_allocate() wont necessarily convert all the delayed allocation passed to it - it will return after allocating the first extent - so if the delayed allocation extends beyond eof then it will stay that way. SGI-PV: 983683 SGI-Modid: xfs-linux-melb:xfs-kern:31929a Signed-off-by: Lachlan McIlroy Signed-off-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_aops.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index f42f80a3b1f..a44d68eb50b 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1338,6 +1338,10 @@ __xfs_get_blocks( offset = (xfs_off_t)iblock << inode->i_blkbits; ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; + + if (!create && direct && offset >= i_size_read(inode)) + return 0; + error = xfs_iomap(XFS_I(inode), offset, size, create ? flags : BMAPI_READ, &iomap, &niomap); if (error) -- cgit v1.2.3-70-g09d2 From b5b8c9acd547244eb2b7d0280ba38b9dd01971cc Mon Sep 17 00:00:00 2001 From: David Chinner Date: Wed, 17 Sep 2008 16:50:50 +1000 Subject: [XFS] Fix barrier status change detection. The current code in xlog_iodone() uses the wrong macro to check if the barrier has been cleared due to an EOPNOTSUPP error form the lower layer. SGI-PV: 986143 SGI-Modid: xfs-linux-melb:xfs-kern:31984a Signed-off-by: David Chinner Signed-off-by: Nathaniel W. Turner Signed-off-by: Peter Leckie Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7812bd036e2..503ea89e8b9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1037,7 +1037,7 @@ xlog_iodone(xfs_buf_t *bp) * layer, it means the underlyin device no longer supports * barrier I/O. Warn loudly and turn off barriers. */ - if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) { + if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ISORDERED(bp)) { l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; xfs_fs_cmn_err(CE_WARN, l->l_mp, "xlog_iodone: Barriers are no longer supported" -- cgit v1.2.3-70-g09d2 From f9114eba1eb08ee75fd0f1eee780f0290fb3c043 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Wed, 17 Sep 2008 16:51:21 +1000 Subject: [XFS] Prevent lockdep false positives when locking two inodes. If we call xfs_lock_two_inodes() to grab both the iolock and the ilock, then drop the ilocks on both inodes, then grab them again (as xfs_swap_extents() does) then lockdep will report a locking order problem. This is a false positive. To avoid this, disallow xfs_lock_two_inodes() fom locking both inode locks at once - force calers to make two separate calls. This means that nested dropping and regaining of the ilocks will retain the same lockdep subclass and so lockdep will not see anything wrong with this code. SGI-PV: 986238 SGI-Modid: xfs-linux-melb:xfs-kern:31999a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Peter Leckie Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_dfrag.c | 9 ++++++++- fs/xfs/xfs_vnodeops.c | 8 ++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 760f4c5b516..75b0cd4da0e 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -149,7 +149,14 @@ xfs_swap_extents( sbp = &sxp->sx_stat; - xfs_lock_two_inodes(ip, tip, lock_flags); + /* + * we have to do two separate lock calls here to keep lockdep + * happy. If we try to get all the locks in one call, lock will + * report false positives when we drop the ILOCK and regain them + * below. + */ + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); locked = 1; /* Verify that both files have the same format */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index aa238c8fbd7..98a0aecafdd 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1838,6 +1838,12 @@ again: #endif } +/* + * xfs_lock_two_inodes() can only be used to lock one type of lock + * at a time - the iolock or the ilock, but not both at once. If + * we lock both at once, lockdep will report false positives saying + * we have violated locking orders. + */ void xfs_lock_two_inodes( xfs_inode_t *ip0, @@ -1848,6 +1854,8 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { -- cgit v1.2.3-70-g09d2 From e1f5dbd7077eebec794452a516cb02f1669b036d Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Wed, 17 Sep 2008 16:52:13 +1000 Subject: [XFS] Fix use-after-free with buffers We have a use-after-free issue where log completions access buffers via the buffer log item and the buffer has already been freed. Fix this by taking a reference on the buffer when attaching the buffer log item and release the hold when the buffer log item is detached and we no longer need the buffer. Also create a new function xfs_buf_item_free() to combine some common code. SGI-PV: 985757 SGI-Modid: xfs-linux-melb:xfs-kern:32025a Signed-off-by: Lachlan McIlroy Signed-off-by: Christoph Hellwig --- fs/xfs/xfs_buf_item.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 608c30c3f76..002fc2617c8 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -732,6 +732,7 @@ xfs_buf_item_init( bip->bli_item.li_ops = &xfs_buf_item_ops; bip->bli_item.li_mountp = mp; bip->bli_buf = bp; + xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); @@ -867,6 +868,21 @@ xfs_buf_item_dirty( return (bip->bli_flags & XFS_BLI_DIRTY); } +STATIC void +xfs_buf_item_free( + xfs_buf_log_item_t *bip) +{ +#ifdef XFS_TRANS_DEBUG + kmem_free(bip->bli_orig); + kmem_free(bip->bli_logged); +#endif /* XFS_TRANS_DEBUG */ + +#ifdef XFS_BLI_TRACE + ktrace_free(bip->bli_trace); +#endif + kmem_zone_free(xfs_buf_item_zone, bip); +} + /* * This is called when the buf log item is no longer needed. It should * free the buf log item associated with the given buffer and clear @@ -887,18 +903,8 @@ xfs_buf_item_relse( (XFS_BUF_IODONE_FUNC(bp) != NULL)) { XFS_BUF_CLR_IODONE_FUNC(bp); } - -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); + xfs_buf_rele(bp); + xfs_buf_item_free(bip); } @@ -1120,6 +1126,7 @@ xfs_buf_iodone( ASSERT(bip->bli_buf == bp); + xfs_buf_rele(bp); mp = bip->bli_item.li_mountp; /* @@ -1136,18 +1143,7 @@ xfs_buf_iodone( * xfs_trans_delete_ail() drops the AIL lock. */ xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); - -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); + xfs_buf_item_free(bip); } #if defined(XFS_BLI_TRACE) -- cgit v1.2.3-70-g09d2 From 2fd6f6ec64ff347447d26646ac6188f3658b383c Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Wed, 17 Sep 2008 16:52:50 +1000 Subject: [XFS] Don't do I/O beyond eof when unreserving space When unreserving space with boundaries that are not block aligned we round up the start and round down the end boundaries and then use this function, xfs_zero_remaining_bytes(), to zero the parts of the blocks that got dropped during the rounding. The problem is we don't consider if these blocks are beyond eof. Worse still is if we encounter delayed allocations beyond eof we will try to use the magic delayed allocation block number as a real block number. If the file size is ever extended to expose these blocks then we'll go through xfs_zero_eof() to zero them anyway. SGI-PV: 983683 SGI-Modid: xfs-linux-melb:xfs-kern:32055a Signed-off-by: Lachlan McIlroy Signed-off-by: Christoph Hellwig --- fs/xfs/xfs_vnodeops.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 98a0aecafdd..8b6812f66a1 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -3160,6 +3160,13 @@ error1: /* Just cancel transaction */ /* * Zero file bytes between startoff and endoff inclusive. * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. + * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding. */ STATIC int xfs_zero_remaining_bytes( @@ -3176,6 +3183,17 @@ xfs_zero_remaining_bytes( int nimap; int error = 0; + /* + * Avoid doing I/O beyond eof - it's not necessary + * since nothing can read beyond eof. The space will + * be zeroed when the file is extended anyway. + */ + if (startoff >= ip->i_size) + return 0; + + if (endoff > ip->i_size) + endoff = ip->i_size; + bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp); -- cgit v1.2.3-70-g09d2 From 0855f310dff76ae42c5aac32f600f8f692bbd23f Mon Sep 17 00:00:00 2001 From: Sebastian Siewior Date: Tue, 9 Sep 2008 11:17:29 +0200 Subject: UBIFS: create the name of the background thread in every case If the ubifs partition is mounted RO and then remounted RW we end up with no thread name in ubifs_remount_rw() and the thread appears nameless. Signed-off-by: Sebastian Siewior Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 7562464ac83..3f4902060c7 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1024,14 +1024,13 @@ static int mount_ubifs(struct ubifs_info *c) goto out_dereg; } + sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); if (!mounted_read_only) { err = alloc_wbufs(c); if (err) goto out_cbuf; /* Create background thread */ - sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, - c->vi.vol_id); c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); if (!c->bgt) c->bgt = ERR_PTR(-EINVAL); -- cgit v1.2.3-70-g09d2 From 6dcfac4f13d6b32fbaa60b64a23249999e66af8e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 12 Sep 2008 12:27:47 +0300 Subject: UBIFS: TNC / GC race fixes - update GC sequence number if any nodes may have been moved even if GC did not finish the LEB - don't ignore error return when reading Signed-off-by: Adrian Hunter Signed-off-by: Artem Bityutskiy --- fs/ubifs/gc.c | 14 +++++++++++--- fs/ubifs/tnc.c | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 13f1019c859..02aba36fe3d 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -334,15 +334,15 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) err = move_nodes(c, sleb); if (err) - goto out; + goto out_inc_seq; err = gc_sync_wbufs(c); if (err) - goto out; + goto out_inc_seq; err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); if (err) - goto out; + goto out_inc_seq; /* Allow for races with TNC */ c->gced_lnum = lnum; @@ -369,6 +369,14 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) out: ubifs_scan_destroy(sleb); return err; + +out_inc_seq: + /* We may have moved at least some nodes so allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + goto out; } /** diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 7da209ab937..7634c597088 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -1476,7 +1476,7 @@ again: } err = fallible_read_node(c, key, &zbr, node); - if (maybe_leb_gced(c, zbr.lnum, gc_seq1)) { + if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) { /* * The node may have been GC'ed out from under us so try again * while keeping the TNC mutex locked. -- cgit v1.2.3-70-g09d2 From 6e14968c869cd30e236bb626dd0c16421d2a658d Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 17 Sep 2008 10:52:07 +0300 Subject: UBIFS: remove incorrect assert The assert was not valid because one of the variables 'taken_empty_lebs' has transient values out of sync with the other variables. Signed-off-by: Adrian Hunter Signed-off-by: Artem Bityutskiy --- fs/ubifs/find.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index e045c8b5542..47814cde240 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -507,7 +507,6 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, rsvd_idx_lebs = 0; lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - c->lst.taken_empty_lebs; - ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs); if (rsvd_idx_lebs < lebs) /* * OK to allocate an empty LEB, but we still don't want to go -- cgit v1.2.3-70-g09d2 From 7424bac82ff3bd956ea04101550e01bdae17284d Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Wed, 17 Sep 2008 22:09:41 +0400 Subject: UBIFS: fix printk format warnings fs/ubifs/dir.c:428: warning: format '%llu' expects type 'long long unsigned int', but argument 5 has type 'long unsigned int' fs/ubifs/debug.c:541: warning: format '%llu' expects type 'long long unsigned int', but argument 2 has type 'long unsigned int' Signed-off-by: Alexander Beregalov Signed-off-by: Artem Bityutskiy --- fs/ubifs/debug.c | 2 +- fs/ubifs/dir.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index b9cb7747375..d7f7645779f 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -538,7 +538,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); for (i = 0; i < n; i++) printk(KERN_DEBUG "\t ino %llu\n", - le64_to_cpu(orph->inos[i])); + (unsigned long long)le64_to_cpu(orph->inos[i])); break; } default: diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 2b267c9a180..526c01ec800 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -426,7 +426,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) while (1) { dbg_gen("feed '%s', ino %llu, new f_pos %#x", - dent->name, le64_to_cpu(dent->inum), + dent->name, (unsigned long long)le64_to_cpu(dent->inum), key_hash_flash(c, &dent->key)); ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum); -- cgit v1.2.3-70-g09d2 From 62aa528e0299ffef8e138d9d92d13e631d06c5ff Mon Sep 17 00:00:00 2001 From: Julien Brunel Date: Wed, 24 Sep 2008 16:22:22 -0500 Subject: 9p: use an IS_ERR test rather than a NULL test In case of error, the function p9_client_walk returns an ERR pointer, but never returns a NULL pointer. So a NULL test that comes after an IS_ERR test should be deleted. The semantic match that finds this problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @match_bad_null_test@ expression x, E; statement S1,S2; @@ x = p9_client_walk(...) ... when != x = E * if (x != NULL) S1 else S2 // Signed-off-by: Julien Brunel Signed-off-by: Julia Lawall Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton --- fs/9p/vfs_inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index c95295c6504..e83aa5ebe86 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -626,8 +626,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, return NULL; error: - if (fid) - p9_client_clunk(fid); + p9_client_clunk(fid); return ERR_PTR(result); } -- cgit v1.2.3-70-g09d2 From f1ccd2955157e1aff992f6aaaba0944209076220 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Fri, 26 Sep 2008 12:16:46 +1000 Subject: [XFS] Fix extent list corruption in xfs_iext_irec_compact_full(). If we don't move all the records from the next buffer into the current buffer then we need to update the er_extoff field of the next buffer as we shift the remaining records to the start of the buffer. SGI-PV: 987159 SGI-Modid: xfs-linux-melb:xfs-kern:32165a Signed-off-by: Lachlan McIlroy Signed-off-by: Eric Sandeen Signed-off-by: Russell Cattelan --- fs/xfs/xfs_inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 00e80df9dd9..419cfc2eacb 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -4584,6 +4584,7 @@ xfs_iext_irec_compact_full( (XFS_LINEAR_EXTS - erp_next->er_extcount) * sizeof(xfs_bmbt_rec_t)); + erp_next->er_extoff += ext_diff; } } -- cgit v1.2.3-70-g09d2 From 71a8c87fb300b601eacf7a86cc6c6322fe827bfd Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Fri, 26 Sep 2008 12:17:57 +1000 Subject: [XFS] Remove xfs_iext_irec_compact_full() Yet another bug was found in xfs_iext_irec_compact_full() and while the source of the bug was found it wasn't an easy task to track it down because the conditions are very difficult to reproduce. A HUGE thank-you goes to Russell Cattelan and Eric Sandeen for their significant effort in tracking down the source of this corruption. xfs_iext_irec_compact_full() and xfs_iext_irec_compact_pages() are almost identical - they both compact indirect extent lists by moving extents from subsequent buffers into earlier ones. xfs_iext_irec_compact_pages() only moves extents if all of the extents in the next buffer will fit into the empty space in the buffer before it. xfs_iext_irec_compact_full() will go a step further and move part of the next buffer if all the extents wont fit. It will then shift the remaining extents in the next buffer up to the start of the buffer. The bug here was that we did not update er_extoff and this caused extent list corruption. It does not appear that this extra functionality gains us much. Calling xfs_iext_irec_compact_pages() instead will do a good enough job at compacting the indirect list and will be quicker too. For the case in xfs_iext_indirect_to_direct() the total number of extents in the indirect list will fit into one buffer so we will never need the extra functionality of xfs_iext_irec_compact_full() there. Also xfs_iext_irec_compact_pages() doesn't need to do a memmove() (the buffers will never overlap) so we don't want the performance hit that can incur. SGI-PV: 987159 SGI-Modid: xfs-linux-melb:xfs-kern:32166a Signed-off-by: Lachlan McIlroy Signed-off-by: Eric Sandeen --- fs/xfs/xfs_inode.c | 95 ++---------------------------------------------------- 1 file changed, 3 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 419cfc2eacb..dbd9cef852e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -4118,7 +4118,7 @@ xfs_iext_indirect_to_direct( ASSERT(nextents <= XFS_LINEAR_EXTS); size = nextents * sizeof(xfs_bmbt_rec_t); - xfs_iext_irec_compact_full(ifp); + xfs_iext_irec_compact_pages(ifp); ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); ep = ifp->if_u1.if_ext_irec->er_extbuf; @@ -4449,8 +4449,7 @@ xfs_iext_irec_remove( * compaction policy is as follows: * * Full Compaction: Extents fit into a single page (or inline buffer) - * Full Compaction: Extents occupy less than 10% of allocated space - * Partial Compaction: Extents occupy > 10% and < 50% of allocated space + * Partial Compaction: Extents occupy less than 50% of allocated space * No Compaction: Extents occupy at least 50% of allocated space */ void @@ -4471,8 +4470,6 @@ xfs_iext_irec_compact( xfs_iext_direct_to_inline(ifp, nextents); } else if (nextents <= XFS_LINEAR_EXTS) { xfs_iext_indirect_to_direct(ifp); - } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) { - xfs_iext_irec_compact_full(ifp); } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { xfs_iext_irec_compact_pages(ifp); } @@ -4496,7 +4493,7 @@ xfs_iext_irec_compact_pages( erp_next = erp + 1; if (erp_next->er_extcount <= (XFS_LINEAR_EXTS - erp->er_extcount)) { - memmove(&erp->er_extbuf[erp->er_extcount], + memcpy(&erp->er_extbuf[erp->er_extcount], erp_next->er_extbuf, erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); erp->er_extcount += erp_next->er_extcount; @@ -4515,92 +4512,6 @@ xfs_iext_irec_compact_pages( } } -/* - * Fully compact the extent records managed by the indirection array. - */ -void -xfs_iext_irec_compact_full( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_bmbt_rec_host_t *ep, *ep_next; /* extent record pointers */ - xfs_ext_irec_t *erp, *erp_next; /* extent irec pointers */ - int erp_idx = 0; /* extent irec index */ - int ext_avail; /* empty entries in ex list */ - int ext_diff; /* number of exts to add */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp = ifp->if_u1.if_ext_irec; - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; - - while (erp_idx < nlists - 1) { - /* - * Check how many extent records are available in this irec. - * If there is none skip the whole exercise. - */ - ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; - if (ext_avail) { - - /* - * Copy over as many as possible extent records into - * the previous page. - */ - ext_diff = MIN(ext_avail, erp_next->er_extcount); - memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t)); - erp->er_extcount += ext_diff; - erp_next->er_extcount -= ext_diff; - - /* - * If the next irec is empty now we can simply - * remove it. - */ - if (erp_next->er_extcount == 0) { - /* - * Free page before removing extent record - * so er_extoffs don't get modified in - * xfs_iext_irec_remove. - */ - kmem_free(erp_next->er_extbuf); - erp_next->er_extbuf = NULL; - xfs_iext_irec_remove(ifp, erp_idx + 1); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* - * If the next irec is not empty move up the content - * that has not been copied to the previous page to - * the beggining of this one. - */ - } else { - memmove(erp_next->er_extbuf, &ep_next[ext_diff], - erp_next->er_extcount * - sizeof(xfs_bmbt_rec_t)); - ep_next = erp_next->er_extbuf; - memset(&ep_next[erp_next->er_extcount], 0, - (XFS_LINEAR_EXTS - - erp_next->er_extcount) * - sizeof(xfs_bmbt_rec_t)); - erp_next->er_extoff += ext_diff; - } - } - - if (erp->er_extcount == XFS_LINEAR_EXTS) { - erp_idx++; - if (erp_idx < nlists) - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - else - break; - } - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; - } -} - /* * This is called to update the er_extoff field in the indirection * array when extents have been added or removed from one of the -- cgit v1.2.3-70-g09d2 From d0185c0882d76b8126d4a099c7ac82b3b216d103 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Sep 2008 07:42:57 -0700 Subject: Fix NULL pointer dereference in proc_sys_compare The VFS interface for the 'd_compare()' is a bit special (read: 'odd'), because it really just essentially replaces a memcmp(). The filesystem is supposed to just compare the two names with whatever case-independent or other function. And when I say 'is supposed to', I obviously mean that 'procfs does odd things, and actually looks at the dentry that we don't even pass down, rather than just the name'. Which results in problems, because we actually call d_compare before we have even verified that the dentry is still hashed at all. And that causes a problm since the inode that procfs looks at may have been free'd and the d_inode pointer is NULL. procfs just assumes that all dentries are positive, since procfs itself never generates a negative one. But memory pressure will still result in the dentry getting torn down, and as it is removed by RCU, it still remains visible on some lists - and to d_compare. If the filesystem just did a name comparison, we wouldn't care. And we could just fix procfs to know about negative dentries too. But rather than have the low-level filesystems know about internal VFS details, just move the check for a unhashed dentry up a bit, so that we will only call d_compare on dentries that are still active. The actual oops this caused didn't look like a NULL pointer dereference because procfs did a 'container_of(inode, struct proc_inode, vfs_inode)' to get at its internal proc_inode information from the inode pointer, and accessed a field below the inode. So the oops would look something like BUG: unable to handle kernel paging request at fffffffffffffff0 IP: [] proc_sys_compare+0x36/0x50 and was seen on both x86-64 (Alexey Dobriyan and Hugh Dickins) and ppc64 (Hugh Dickins). Reported-by: Alexey Dobriyan Acked-by: Hugh Dickins Cc: Al Viro Reviewed-by: "Eric W. Biederman" Signed-of-by: Linus Torvalds --- fs/dcache.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 80e93956ace..e7a1a99b746 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1395,6 +1395,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) if (dentry->d_parent != parent) goto next; + /* non-existing due to RCU? */ + if (d_unhashed(dentry)) + goto next; + /* * It is safe to compare names since d_move() cannot * change the qstr (protected by d_lock). @@ -1410,10 +1414,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) goto next; } - if (!d_unhashed(dentry)) { - atomic_inc(&dentry->d_count); - found = dentry; - } + atomic_inc(&dentry->d_count); + found = dentry; spin_unlock(&dentry->d_lock); break; next: -- cgit v1.2.3-70-g09d2 From 31a78f23bac0069004e69f98808b6988baccb6b6 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Sun, 28 Sep 2008 23:09:31 +0100 Subject: mm owner: fix race between swapoff and exit There's a race between mm->owner assignment and swapoff, more easily seen when task slab poisoning is turned on. The condition occurs when try_to_unuse() runs in parallel with an exiting task. A similar race can occur with callers of get_task_mm(), such as /proc// or ptrace or page migration. CPU0 CPU1 try_to_unuse looks at mm = task0->mm increments mm->mm_users task 0 exits mm->owner needs to be updated, but no new owner is found (mm_users > 1, but no other task has task->mm = task0->mm) mm_update_next_owner() leaves mmput(mm) decrements mm->mm_users task0 freed dereferencing mm->owner fails The fix is to notify the subsystem via mm_owner_changed callback(), if no new owner is found, by specifying the new task as NULL. Jiri Slaby: mm->owner was set to NULL prior to calling cgroup_mm_owner_callbacks(), but must be set after that, so as not to pass NULL as old owner causing oops. Daisuke Nishimura: mm_update_next_owner() may set mm->owner to NULL, but mem_cgroup_from_task() and its callers need to take account of this situation to avoid oops. Hugh Dickins: Lockdep warning and hang below exec_mmap() when testing these patches. exit_mm() up_reads mmap_sem before calling mm_update_next_owner(), so exec_mmap() now needs to do the same. And with that repositioning, there's now no point in mm_need_new_owner() allowing for NULL mm. Reported-by: Hugh Dickins Signed-off-by: Balbir Singh Signed-off-by: Jiri Slaby Signed-off-by: Daisuke Nishimura Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- kernel/cgroup.c | 5 +++-- kernel/exit.c | 12 ++++++++++-- mm/memcontrol.c | 17 +++++++++++++++++ 4 files changed, 31 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 32993beecbe..cecee501ce7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -752,11 +752,11 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); - mm_update_next_owner(old_mm); arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); + mm_update_next_owner(old_mm); mmput(old_mm); return 0; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 13932abde15..a0123d75ec9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2738,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child) */ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) { - struct cgroup *oldcgrp, *newcgrp; + struct cgroup *oldcgrp, *newcgrp = NULL; if (need_mm_owner_callback) { int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; oldcgrp = task_cgroup(old, ss->subsys_id); - newcgrp = task_cgroup(new, ss->subsys_id); + if (new) + newcgrp = task_cgroup(new, ss->subsys_id); if (oldcgrp == newcgrp) continue; if (ss->mm_owner_changed) diff --git a/kernel/exit.c b/kernel/exit.c index 16395644a98..85a83c83185 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) * If there are other users of the mm and the owner (us) is exiting * we need to find a new owner to take on the responsibility. */ - if (!mm) - return 0; if (atomic_read(&mm->mm_users) <= 1) return 0; if (mm->owner != p) @@ -627,6 +625,16 @@ retry: } while_each_thread(g, c); read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL, + * so that subsystems can understand the callback and take action. + */ + down_write(&mm->mmap_sem); + cgroup_mm_owner_callbacks(mm->owner, NULL); + mm->owner = NULL; + up_write(&mm->mmap_sem); return; assign_new_owner: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c0500e4d3a2..36896f3eb7f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { + /* + * mm_update_next_owner() may clear mm->owner to NULL + * if it races with swapoff, page migration, etc. + * So this can be called with p == NULL. + */ + if (unlikely(!p)) + return NULL; + return container_of(task_subsys_state(p, mem_cgroup_subsys_id), struct mem_cgroup, css); } @@ -549,6 +557,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, if (likely(!memcg)) { rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) { + rcu_read_unlock(); + kmem_cache_free(page_cgroup_cache, pc); + return 0; + } /* * For every charge from the cgroup, increment reference count */ @@ -801,6 +814,10 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) { + rcu_read_unlock(); + return 0; + } css_get(&mem->css); rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From 16dbc6c9616363fe53811abcbd935336dc0a0f01 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 2 Oct 2008 14:50:12 -0700 Subject: inotify: fix lock ordering wrt do_page_fault's mmap_sem Fix inotify lock order reversal with mmap_sem due to holding locks over copy_to_user. Signed-off-by: Nick Piggin Reported-by: "Daniel J Blueman" Tested-by: "Daniel J Blueman" Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inotify_user.c | 27 ++++++++++++++++++++------- include/asm-x86/uaccess_64.h | 1 + 2 files changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 60249429a25..d85c7d931cd 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -323,7 +323,7 @@ out: } /* - * remove_kevent - cleans up and ultimately frees the given kevent + * remove_kevent - cleans up the given kevent * * Caller must hold dev->ev_mutex. */ @@ -334,7 +334,13 @@ static void remove_kevent(struct inotify_device *dev, dev->event_count--; dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; +} +/* + * free_kevent - frees the given kevent. + */ +static void free_kevent(struct inotify_kernel_event *kevent) +{ kfree(kevent->name); kmem_cache_free(event_cachep, kevent); } @@ -350,6 +356,7 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev) struct inotify_kernel_event *kevent; kevent = inotify_dev_get_event(dev); remove_kevent(dev, kevent); + free_kevent(kevent); } } @@ -433,17 +440,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf, dev = file->private_data; while (1) { - int events; prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); mutex_lock(&dev->ev_mutex); - events = !list_empty(&dev->events); - mutex_unlock(&dev->ev_mutex); - if (events) { + if (!list_empty(&dev->events)) { ret = 0; break; } + mutex_unlock(&dev->ev_mutex); if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; @@ -462,7 +467,6 @@ static ssize_t inotify_read(struct file *file, char __user *buf, if (ret) return ret; - mutex_lock(&dev->ev_mutex); while (1) { struct inotify_kernel_event *kevent; @@ -481,6 +485,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf, } break; } + remove_kevent(dev, kevent); + + /* + * Must perform the copy_to_user outside the mutex in order + * to avoid a lock order reversal with mmap_sem. + */ + mutex_unlock(&dev->ev_mutex); if (copy_to_user(buf, &kevent->event, event_size)) { ret = -EFAULT; @@ -498,7 +509,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf, count -= kevent->event.len; } - remove_kevent(dev, kevent); + free_kevent(kevent); + + mutex_lock(&dev->ev_mutex); } mutex_unlock(&dev->ev_mutex); diff --git a/include/asm-x86/uaccess_64.h b/include/asm-x86/uaccess_64.h index 515d4dce96b..45806d60bcb 100644 --- a/include/asm-x86/uaccess_64.h +++ b/include/asm-x86/uaccess_64.h @@ -7,6 +7,7 @@ #include #include #include +#include #include /* -- cgit v1.2.3-70-g09d2 From 4b19de6d1cb07c8bcb6778e771f9cfd5bcfdfd3e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 2 Oct 2008 14:50:16 -0700 Subject: mm: tiny-shmem nommu fix The previous patch db203d53d474aa068984e409d807628f5841da1b ("mm: tiny-shmem fix lock ordering: mmap_sem vs i_mutex") to fix the lock ordering in tiny-shmem breaks shared anonymous and IPC memory on NOMMU architectures because it was using the expanding truncate to signal ramfs to allocate a physically contiguous RAM backing the inode (otherwise it is unusable for "memory mapping" it to userspace). However do_truncate is what caused the lock ordering error, due to it taking i_mutex. In this case, we can actually just call ramfs directly to allocate memory for the mapping, rather than go via truncate. Acked-by: David Howells Acked-by: Hugh Dickins Signed-off-by: Nick Piggin Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 2 +- include/linux/ramfs.h | 1 + mm/tiny-shmem.c | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 52312ec93ff..5145cb9125a 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -58,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = { * size 0 on the assumption that it's going to be used for an mmap of shared * memory */ -static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) { struct pagevec lru_pvec; unsigned long npages, xpages, loop, limit; diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index b160fb18e8d..37aaf2b3986 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -6,6 +6,7 @@ extern int ramfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt); #ifndef CONFIG_MMU +extern int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize); extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index d17cb6f6ab1..8d7a27a6335 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -80,6 +80,12 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) inode->i_nlink = 0; /* It is unlinked */ init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, &ramfs_file_operations); + +#ifndef CONFIG_MMU + error = ramfs_nommu_expand_for_mapping(inode, size); + if (error) + goto close_file; +#endif return file; close_file: -- cgit v1.2.3-70-g09d2