From 73c101011926c5832e6e141682180c4debe2cf45 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Mar 2011 13:19:51 +0100 Subject: block: initial patch for on-stack per-task plugging This patch adds support for creating a queuing context outside of the queue itself. This enables us to batch up pieces of IO before grabbing the block device queue lock and submitting them to the IO scheduler. The context is created on the stack of the process and assigned in the task structure, so that we can auto-unplug it if we hit a schedule event. The current queue plugging happens implicitly if IO is submitted to an empty device, yet callers have to remember to unplug that IO when they are going to wait for it. This is an ugly API and has caused bugs in the past. Additionally, it requires hacks in the vm (->sync_page() callback) to handle that logic. By switching to an explicit plugging scheme we make the API a lot nicer and can get rid of the ->sync_page() hack in the vm. Signed-off-by: Jens Axboe --- kernel/exit.c | 1 + kernel/fork.c | 3 +++ kernel/sched.c | 12 ++++++++++++ 3 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f9a45ebcc7b..6a488ad2dce 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code) profile_task_exit(tsk); WARN_ON(atomic_read(&tsk->fs_excl)); + WARN_ON(blk_needs_flush_plug(tsk)); if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); diff --git a/kernel/fork.c b/kernel/fork.c index 25e429152dd..027c80e5162 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1204,6 +1204,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; +#ifdef CONFIG_BLOCK + p->plug = NULL; +#endif #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7b..ca098bf4cc6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3978,6 +3978,16 @@ need_resched_nonpreemptible: switch_count = &prev->nvcsw; } + /* + * If we are going to sleep and we have plugged IO queued, make + * sure to submit it to avoid deadlocks. + */ + if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) { + raw_spin_unlock(&rq->lock); + blk_flush_plug(prev); + raw_spin_lock(&rq->lock); + } + pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) @@ -5333,6 +5343,7 @@ void __sched io_schedule(void) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; schedule(); current->in_iowait = 0; @@ -5348,6 +5359,7 @@ long __sched io_schedule_timeout(long timeout) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; ret = schedule_timeout(timeout); current->in_iowait = 0; -- cgit v1.2.3-70-g09d2 From 721a9602e6607417c6bc15b18e97a2f35266c690 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Mar 2011 11:56:30 +0100 Subject: block: kill off REQ_UNPLUG With the plugging now being explicitly controlled by the submitter, callers need not pass down unplugging hints to the block layer. If they want to unplug, it's because they manually plugged on their own - in which case, they should just unplug at will. 
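For illustration only (a sketch, not part of either commit message): with the explicit on-stack plugging this series introduces, a submitter batches IO roughly as below. blk_start_plug()/blk_finish_plug() come from the series itself; submit_batch_of_bios() is a hypothetical placeholder for whatever issues the bios.

	struct blk_plug plug;

	blk_start_plug(&plug);		/* subsequent IO is held in the per-task plug list */
	submit_batch_of_bios();		/* hypothetical helper queuing several bios */
	blk_finish_plug(&plug);		/* flush the batch to the IO scheduler */

If the task sleeps while IO is still plugged, the schedule() hook above flushes the plug automatically, and do_exit() warns if a plug is left behind.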
Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- drivers/block/drbd/drbd_actlog.c | 2 +- drivers/block/drbd/drbd_int.h | 2 +- drivers/block/drbd/drbd_main.c | 3 +-- drivers/block/drbd/drbd_receiver.c | 9 +-------- drivers/md/bitmap.c | 2 +- drivers/md/dm-io.c | 2 +- drivers/md/dm-kcopyd.c | 5 +---- drivers/md/md.c | 5 ++--- fs/btrfs/extent_io.c | 2 +- fs/buffer.c | 14 ++++---------- fs/direct-io.c | 2 +- fs/ext4/page-io.c | 3 +-- fs/gfs2/log.c | 4 ++-- fs/gfs2/lops.c | 12 ++++++------ fs/gfs2/meta_io.c | 2 +- fs/jbd/commit.c | 2 +- fs/jbd2/commit.c | 6 +++--- fs/nilfs2/segbuf.c | 2 +- fs/xfs/linux-2.6/xfs_aops.c | 3 +-- include/linux/blk_types.h | 2 -- include/linux/fs.h | 28 +++++++++------------------- kernel/power/block_io.c | 2 +- mm/page_io.c | 2 +- 24 files changed, 43 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index 82a45898ba7..7e9715ae18c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1290,7 +1290,7 @@ get_rq: } plug = current->plug; - if (plug && !sync) { + if (plug) { if (!plug->should_sort && !list_empty(&plug->list)) { struct request *__rq; diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 2096628d6e6..aca302492ff 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -80,7 +80,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) rw |= REQ_FUA; - rw |= REQ_UNPLUG | REQ_SYNC; + rw |= REQ_SYNC; bio = bio_alloc(GFP_NOIO, 1); bio->bi_bdev = bdev->md_bdev; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 0b5718e1958..b0bd27dfc1e 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -377,7 +377,7 @@ union p_header { #define DP_HARDBARRIER 1 /* depricated */ #define DP_RW_SYNC 2 /* equals REQ_SYNC */ #define DP_MAY_SET_IN_SYNC 4 -#define DP_UNPLUG 8 /* equals REQ_UNPLUG */ +#define DP_UNPLUG 8 /* not used anymore */ #define DP_FUA 16 /* equals REQ_FUA */ #define DP_FLUSH 32 /* equals REQ_FLUSH */ #define DP_DISCARD 64 /* equals REQ_DISCARD */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 6049cb85310..8a43ce0edee 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2477,12 +2477,11 @@ static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) { if (mdev->agreed_pro_version >= 95) return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | - (bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) | (bi_rw & REQ_FUA ? DP_FUA : 0) | (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | (bi_rw & REQ_DISCARD ? DP_DISCARD : 0); else - return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? DP_RW_SYNC : 0; + return bi_rw & REQ_SYNC ? 
DP_RW_SYNC : 0; } /* Used to send write requests diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 84132f8bf8a..8e68be939de 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1100,8 +1100,6 @@ next_bio: /* > e->sector, unless this is the first bio */ bio->bi_sector = sector; bio->bi_bdev = mdev->ldev->backing_bdev; - /* we special case some flags in the multi-bio case, see below - * (REQ_UNPLUG) */ bio->bi_rw = rw; bio->bi_private = e; bio->bi_end_io = drbd_endio_sec; @@ -1130,10 +1128,6 @@ next_bio: bios = bios->bi_next; bio->bi_next = NULL; - /* strip off REQ_UNPLUG unless it is the last bio */ - if (bios) - bio->bi_rw &= ~REQ_UNPLUG; - drbd_generic_make_request(mdev, fault_type, bio); } while (bios); return 0; @@ -1621,12 +1615,11 @@ static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf) { if (mdev->agreed_pro_version >= 95) return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | - (dpf & DP_UNPLUG ? REQ_UNPLUG : 0) | (dpf & DP_FUA ? REQ_FUA : 0) | (dpf & DP_FLUSH ? REQ_FUA : 0) | (dpf & DP_DISCARD ? REQ_DISCARD : 0); else - return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0; + return dpf & DP_RW_SYNC ? REQ_SYNC : 0; } /* mirrored write */ diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 54bfc274b39..ca203cb23f3 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -347,7 +347,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) atomic_inc(&bitmap->pending_writes); set_buffer_locked(bh); set_buffer_mapped(bh); - submit_bh(WRITE | REQ_UNPLUG | REQ_SYNC, bh); + submit_bh(WRITE | REQ_SYNC, bh); bh = bh->b_this_page; } diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 136d4f71a11..76a5af00a26 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -352,7 +352,7 @@ static void dispatch_io(int rw, unsigned int num_regions, BUG_ON(num_regions > DM_IO_MAX_REGIONS); if (sync) - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= REQ_SYNC; /* * For multiple regions we need to be careful to rewind diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 400cf35094a..1bb73a13ca4 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -356,11 +356,8 @@ static int run_io_job(struct kcopyd_job *job) if (job->rw == READ) r = dm_io(&io_req, 1, &job->source, NULL); - else { - if (job->num_dests > 1) - io_req.bi_rw |= REQ_UNPLUG; + else r = dm_io(&io_req, job->num_dests, job->dests, NULL); - } return r; } diff --git a/drivers/md/md.c b/drivers/md/md.c index ca0d79c264b..28f9c1ee4e3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -777,8 +777,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, bio->bi_end_io = super_written; atomic_inc(&mddev->pending_writes); - submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA, - bio); + submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio); } void md_super_wait(mddev_t *mddev) @@ -806,7 +805,7 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, struct completion event; int ret; - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= REQ_SYNC; bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? 
rdev->meta_bdev : rdev->bdev; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 92ac5192c51..b76f7cd4740 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2182,7 +2182,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned long nr_written = 0; if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC_PLUG; + write_flags = WRITE_SYNC; else write_flags = WRITE; diff --git a/fs/buffer.c b/fs/buffer.c index f903f2e5b4f..42534f67d71 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -767,7 +767,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * still in flight on potentially older * contents. */ - write_dirty_buffer(bh, WRITE_SYNC_PLUG); + write_dirty_buffer(bh, WRITE_SYNC); /* * Kick off IO for the previous mapping. Note @@ -1602,14 +1602,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata); * prevents this contention from occurring. * * If block_write_full_page() is called with wbc->sync_mode == - * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this - * causes the writes to be flagged as synchronous writes, but the - * block device queue will NOT be unplugged, since usually many pages - * will be pushed to the out before the higher-level caller actually - * waits for the writes to be completed. The various wait functions, - * such as wait_on_writeback_range() will ultimately call sync_page() - * which will ultimately call blk_run_backing_dev(), which will end up - * unplugging the device queue. + * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this + * causes the writes to be flagged as synchronous writes. */ static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block, struct writeback_control *wbc, @@ -1622,7 +1616,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, const unsigned blocksize = 1 << inode->i_blkbits; int nr_underway = 0; int write_op = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC_PLUG : WRITE); + WRITE_SYNC : WRITE); BUG_ON(!PageLocked(page)); diff --git a/fs/direct-io.c b/fs/direct-io.c index df709b3b860..42608313609 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1173,7 +1173,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct dio *dio; if (rw & WRITE) - rw = WRITE_ODIRECT_PLUG; + rw = WRITE_ODIRECT; if (bdev) bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 955cc309142..e2cd90e4bb7 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -310,8 +310,7 @@ static int io_submit_init(struct ext4_io_submit *io, io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); io->io_bio = bio; - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC_PLUG : WRITE); + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE); io->io_next_block = bh->b_blocknr; return 0; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index eb01f3575e1..7f1c1120234 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -121,7 +121,7 @@ __acquires(&sdp->sd_log_lock) lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); } else { unlock_buffer(bh); brelse(bh); @@ -647,7 +647,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) lock_buffer(bh); if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); } else { unlock_buffer(bh); brelse(bh); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index bf33f822058..48b545a1979 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -200,7 +200,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) } gfs2_log_unlock(sdp); - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); gfs2_log_lock(sdp); n = 0; @@ -210,7 +210,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) gfs2_log_unlock(sdp); lock_buffer(bd2->bd_bh); bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); gfs2_log_lock(sdp); if (++n >= num) break; @@ -352,7 +352,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) sdp->sd_log_num_revoke--; if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); bh = gfs2_log_get_buf(sdp); mh = (struct gfs2_meta_header *)bh->b_data; @@ -369,7 +369,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) } gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); } static void revoke_lo_before_scan(struct gfs2_jdesc *jd, @@ -571,7 +571,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, ptr = bh_log_ptr(bh); get_bh(bh); - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); gfs2_log_lock(sdp); while(!list_empty(list)) { bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); @@ -597,7 +597,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, } else { bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); } - submit_bh(WRITE_SYNC_PLUG, bh1); + submit_bh(WRITE_SYNC, bh1); gfs2_log_lock(sdp); ptr += 2; } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index a566331db4e..867b713cba9 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb struct buffer_head *bh, *head; int nr_underway = 0; int write_op = REQ_META | - (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE); + (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 34a4861c14b..66be299acb1 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -333,7 +333,7 @@ void journal_commit_transaction(journal_t *journal) * instead we rely on sync_buffer() doing the unplug for us. 
*/ if (commit_transaction->t_synchronous_commit) - write_op = WRITE_SYNC_PLUG; + write_op = WRITE_SYNC; spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index f3ad1598b20..3da1cc4346d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -137,9 +137,9 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) - ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh); + ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh); else - ret = submit_bh(WRITE_SYNC_PLUG, bh); + ret = submit_bh(WRITE_SYNC, bh); *cbh = bh; return ret; @@ -369,7 +369,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * instead we rely on sync_buffer() doing the unplug for us. */ if (commit_transaction->t_synchronous_commit) - write_op = WRITE_SYNC_PLUG; + write_op = WRITE_SYNC; trace_jbd2_commit_locking(journal, commit_transaction); stats.run.rs_wait = commit_transaction->t_max_wait; stats.run.rs_locked = jiffies; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 0f83e93935b..2853ff20f85 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -509,7 +509,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, * Last BIO is always sent through the following * submission. */ - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= REQ_SYNC; res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); } diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 83c1c20d145..6bbb0ee3325 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -413,8 +413,7 @@ xfs_submit_ioend_bio( if (xfs_ioend_new_eof(ioend)) xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC_PLUG : WRITE, bio); + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); } STATIC struct bio * diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 16b28647304..be50d9e70a7 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -128,7 +128,6 @@ enum rq_flag_bits { __REQ_NOIDLE, /* don't anticipate more IO after this one */ /* bio only flags */ - __REQ_UNPLUG, /* unplug the immediately after submission */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_THROTTLED, /* This bio has already been subjected to * throttling rules. Don't do it again. */ @@ -172,7 +171,6 @@ enum rq_flag_bits { REQ_NOIDLE | REQ_FLUSH | REQ_FUA) #define REQ_CLONE_MASK REQ_COMMON_MASK -#define REQ_UNPLUG (1 << __REQ_UNPLUG) #define REQ_RAHEAD (1 << __REQ_RAHEAD) #define REQ_THROTTLED (1 << __REQ_THROTTLED) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9f2cf69911b..543e226ea6a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -135,16 +135,10 @@ struct inodes_stat_t { * block layer could (in theory) choose to ignore this * request if it runs into resource problems. * WRITE A normal async write. Device will be plugged. - * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down + * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down * the hint that someone will be waiting on this IO - * shortly. The device must still be unplugged explicitly, - * WRITE_SYNC_PLUG does not do this as we could be - * submitting more writes before we actually wait on any - * of them. - * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device - * immediately after submission. 
The write equivalent - * of READ_SYNC. - * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only. + * shortly. The write equivalent of READ_SYNC. + * WRITE_ODIRECT Special case write for O_DIRECT only. * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush. * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on * non-volatile media on completion. @@ -160,18 +154,14 @@ struct inodes_stat_t { #define WRITE RW_MASK #define READA RWA_MASK -#define READ_SYNC (READ | REQ_SYNC | REQ_UNPLUG) +#define READ_SYNC (READ | REQ_SYNC) #define READ_META (READ | REQ_META) -#define WRITE_SYNC_PLUG (WRITE | REQ_SYNC | REQ_NOIDLE) -#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG) -#define WRITE_ODIRECT_PLUG (WRITE | REQ_SYNC) +#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE) +#define WRITE_ODIRECT (WRITE | REQ_SYNC) #define WRITE_META (WRITE | REQ_META) -#define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ - REQ_FLUSH) -#define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ - REQ_FUA) -#define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ - REQ_FLUSH | REQ_FUA) +#define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH) +#define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA) +#define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) #define SEL_IN 1 #define SEL_OUT 2 diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 83bbc7c02df..d09dd10c5a5 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -28,7 +28,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, struct page *page, struct bio **bio_chain) { - const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; + const int bio_rw = rw | REQ_SYNC; struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); diff --git a/mm/page_io.c b/mm/page_io.c index 2dee975bf46..dc76b4d0611 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= REQ_SYNC; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); -- cgit v1.2.3-70-g09d2 From 805f6b5e1cbfedfb9b3d354013e7f4b13a79270f Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 11 Mar 2011 20:11:59 +0100 Subject: blktrace: Use rq->cmd_flags directly in blk_add_trace_rq. In blk_add_trace_rq, we only chose the minor 2 bits from request's cmd_flags and did some check for discard. so most of other flags(e.g, REQ_SYNC) are missing. For example, with a sync write after blkparse we get: 8,16 1 1 0.001776503 7509 A WS 1349632 + 1024 <- (8,17) 1347584 8,16 1 2 0.001776813 7509 Q WS 1349632 + 1024 [dd] 8,16 1 3 0.001780395 7509 G WS 1349632 + 1024 [dd] 8,16 1 5 0.001783186 7509 I W 1349632 + 1024 [dd] 8,16 1 11 0.001816987 7509 D W 1349632 + 1024 [dd] 8,16 0 2 0.006218192 0 C W 1349632 + 1024 [0] Since now we have integrated the flags of both bio and request, it is safe to pass rq->cmd_flags directly to __blk_add_trace. 
With this patch, after a sync write we get: 8,16 1 1 0.001776900 5425 A WS 1189888 + 1024 <- (8,17) 1187840 8,16 1 2 0.001777179 5425 Q WS 1189888 + 1024 [dd] 8,16 1 3 0.001780797 5425 G WS 1189888 + 1024 [dd] 8,16 1 5 0.001783402 5425 I WS 1189888 + 1024 [dd] 8,16 1 11 0.001817468 5425 D WS 1189888 + 1024 [dd] 8,16 0 2 0.005640709 0 C WS 1189888 + 1024 [0] Signed-off-by: Tao Ma Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index cbafed7d4f3..7aa40f8e182 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q) * **/ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) + u32 what) { struct blk_trace *bt = q->blk_trace; - int rw = rq->cmd_flags & 0x03; if (likely(!bt)) return; - if (rq->cmd_flags & REQ_DISCARD) - rw |= REQ_DISCARD; - - if (rq->cmd_flags & REQ_SECURE) - rw |= REQ_SECURE; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, + __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, - what, rq->errors, 0, NULL); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq->cmd_flags, what, rq->errors, 0, NULL); } } -- cgit v1.2.3-70-g09d2 From ed3cd4a86562eee79de25b567a00e648cc3dc2bf Mon Sep 17 00:00:00 2001 From: matt mooney Date: Fri, 14 Jan 2011 06:12:24 -0800 Subject: kernel: change to new flag variable Replace EXTRA_CFLAGS with ccflags-y. Signed-off-by: matt mooney Acked-by: WANG Cong Signed-off-by: Michal Marek --- kernel/gcov/Makefile | 2 +- kernel/power/Makefile | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 3f761001d51..e97ca59e252 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,3 +1,3 @@ -EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' +ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c350e18b53e..c5ebc6a9064 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,4 +1,5 @@ -ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG + +ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o -- cgit v1.2.3-70-g09d2 From d57f078b193981d1b7d24193f3118c6b806db0ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 18 Mar 2011 16:54:31 +0000 Subject: KGDB: Notify GDB of machine halt, reboot or power off Notify GDB of the machine halting, rebooting or powering off by sending it an exited command (remote protocol command 'W'). This is done by calling: void gdbstub_exit(int status) from the arch's machine_{halt,restart,power_off}() functions with an appropriate exit status to be reported to GDB. 
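For illustration only (hypothetical arch code, not taken from the patch), an architecture's halt path wiring this up might look roughly like:

	void machine_halt(void)
	{
		gdbstub_exit(0);	/* report exit status 0 to the attached GDB via the 'W' packet */
		/* ... architecture-specific halt sequence follows ... */
	}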
Signed-off-by: David Howells --- include/linux/kgdb.h | 1 + kernel/debug/gdbstub.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'kernel') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 092e4250a45..10ca03d0a25 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -297,6 +297,7 @@ extern int kgdb_handle_exception(int ex_vector, int signo, int err_code, struct pt_regs *regs); extern int kgdb_nmicallback(int cpu, void *regs); +extern void gdbstub_exit(int status); extern int kgdb_single_step; extern atomic_t kgdb_active; diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 481a7bd2dfe..a11db956dd6 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) put_packet(remcom_out_buffer); return 0; } + +/** + * gdbstub_exit - Send an exit message to GDB + * @status: The exit code to report. + */ +void gdbstub_exit(int status) +{ + unsigned char checksum, ch, buffer[3]; + int loop; + + buffer[0] = 'W'; + buffer[1] = hex_asc_hi(status); + buffer[2] = hex_asc_lo(status); + + dbg_io_ops->write_char('$'); + checksum = 0; + + for (loop = 0; loop < 3; loop++) { + ch = buffer[loop]; + checksum += ch; + dbg_io_ops->write_char(ch); + } + + dbg_io_ops->write_char('#'); + dbg_io_ops->write_char(hex_asc_hi(checksum)); + dbg_io_ops->write_char(hex_asc_lo(checksum)); + + /* make sure the output is flushed, lest the bootloader clobber it */ + dbg_io_ops->flush(); +} -- cgit v1.2.3-70-g09d2 From 16addf954d3954a72fd56abc02ffcba3c18529a1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 18 Mar 2011 09:34:53 -0700 Subject: sched: Fix yield_to kernel-doc Add missing function parameters for yield_to(): Warning(kernel/sched.c:5470): No description found for parameter 'p' Warning(kernel/sched.c:5470): No description found for parameter 'preempt' Signed-off-by: Randy Dunlap Cc: Peter Zijlstra LKML-Reference: <20110318093453.8f7489a4.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 58d66ea7d20..052120d6770 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5467,6 +5467,8 @@ EXPORT_SYMBOL(yield); * yield_to - yield the current processor to another thread in * your thread group, or accelerate that thread toward the * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not * * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. -- cgit v1.2.3-70-g09d2 From da48524eb20662618854bb3df2db01fc65f3070c Mon Sep 17 00:00:00 2001 From: Julien Tinnes Date: Fri, 18 Mar 2011 15:05:21 -0700 Subject: Prevent rt_sigqueueinfo and rt_tgsigqueueinfo from spoofing the signal code Userland should be able to trust the pid and uid of the sender of a signal if the si_code is SI_TKILL. Unfortunately, the kernel has historically allowed sigqueueinfo() to send any si_code at all (as long as it was negative - to distinguish it from kernel-generated signals like SIGILL etc), so it could spoof a SI_TKILL with incorrect siginfo values. Happily, it looks like glibc has always set si_code to the appropriate SI_QUEUE, so there are probably no actual user code that ever uses anything but the appropriate SI_QUEUE flag. 
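For illustration only (a sketch, not part of the commit), this is the kind of userspace spoofing attempt the tightened check rejects, assuming the usual <signal.h>, <string.h>, <sys/syscall.h> and <unistd.h> headers and a hypothetical target_pid:

	siginfo_t info;

	memset(&info, 0, sizeof(info));
	info.si_signo = SIGUSR1;
	info.si_code  = SI_TKILL;	/* spoofed: pretends the signal came from tkill() */
	info.si_pid   = 1;		/* forged sender pid */
	info.si_uid   = 0;		/* forged sender uid */

	/* Any negative si_code used to be accepted; with this patch the call
	   fails with -EPERM unless si_code is SI_QUEUE, which is what glibc's
	   sigqueue() passes. */
	syscall(SYS_rt_sigqueueinfo, target_pid, SIGUSR1, &info);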
So just tighten the check for si_code (we used to allow any negative value), and add a (one-time) warning in case there are binaries out there that might depend on using other si_code values. Signed-off-by: Julien Tinnes Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/signal.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 4e3cff10fdc..31751868de8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2421,9 +2421,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, return -EFAULT; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info.si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info.si_code != SI_QUEUE) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info.si_code < 0); return -EPERM; + } info.si_signo = sig; /* POSIX.1b doesn't mention process groups. */ @@ -2437,9 +2441,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) return -EINVAL; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info->si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info->si_code != SI_QUEUE) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info->si_code < 0); return -EPERM; + } info->si_signo = sig; return do_send_specific(tgid, pid, sig, info); -- cgit v1.2.3-70-g09d2 From 1106b6997df7d0c0487e21fd9c9dd2ce3d4a52db Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Feb 2011 17:35:34 +0100 Subject: tracing: Fix set_ftrace_filter probe function display If one or more function probes (like traceon) are enabled, and there's no other function filter, the first probe func is skipped (which one depends on the position in the hash). $ echo sys_open:traceon sys_close:traceon > ./set_ftrace_filter $ cat set_ftrace_filter #### all functions enabled #### sys_close:traceon:unlimited $ The reason was, that in the case of no other function filter, the func_pos was not properly updated before calling t_hash_start. Signed-off-by: Jiri Olsa LKML-Reference: <1297874134-7008-1-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 888b611897d..c075f4ea6b9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1467,7 +1467,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) return t_hash_next(m, pos); (*pos)++; - iter->pos = *pos; + iter->pos = iter->func_pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) return t_hash_start(m, pos); @@ -1502,7 +1502,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (!rec) return t_hash_start(m, pos); - iter->func_pos = *pos; iter->func = rec; return iter; -- cgit v1.2.3-70-g09d2 From 8d2587970b8bdf7c8d9208e3f4bb93182aef1a0f Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Tue, 22 Mar 2011 16:30:13 -0700 Subject: cgroups: if you list_empty() a head then don't list_del() it list_del() leaves poison in the prev and next pointers. The next list_empty() will compare those poisons, and say the list isn't empty. Any list operations that assume the node is on a list because of such a check will be fooled into dereferencing poison. 
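The failure mode described above, illustrated (not taken from the commit) with the tsk->cg_list node the patch touches:

	list_del(&tsk->cg_list);		/* prev/next now hold LIST_POISON1/2 */
	if (!list_empty(&tsk->cg_list))		/* compares poison values, so the list looks non-empty */
		list_del(&tsk->cg_list);	/* dereferences the poison pointers: crash */

	/* Pattern the patch switches to: */
	list_del_init(&tsk->cg_list);		/* node is re-initialised to point at itself */
	if (!list_empty(&tsk->cg_list))		/* now correctly reports empty */
		list_del(&tsk->cg_list);	/* not reached */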
One needs to INIT the node after the del, and fortunately there's already a wrapper for that - list_del_init(). Some of the dels are followed by deallocations, so can be ignored, and one can be merged with an add to make a move. Apart from that, I erred on the side of caution in making nodes list_empty()-queriable. Signed-off-by: Phil Carmody Reviewed-by: Paul Menage Cc: Li Zefan Acked-by: Kirill A. Shutemov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 95362d15128..e31b220a743 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) /* Update the css_set linked lists if we're using them */ write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - list_del(&tsk->cg_list); - list_add(&tsk->cg_list, &newcg->tasks); - } + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); write_unlock(&css_set_lock); for_each_subsys(root, ss) { @@ -3655,12 +3653,12 @@ again: spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) - list_del(&cgrp->release_list); + list_del_init(&cgrp->release_list); spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ - list_del(&cgrp->sibling); + list_del_init(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); d = dget(cgrp->dentry); @@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) subsys[ss->subsys_id] = NULL; /* remove subsystem from rootnode's list of subsystems */ - list_del(&ss->sibling); + list_del_init(&ss->sibling); /* * disentangle the css from all css_sets attached to the dummytop. as @@ -4241,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) if (!list_empty(&tsk->cg_list)) { write_lock(&css_set_lock); if (!list_empty(&tsk->cg_list)) - list_del(&tsk->cg_list); + list_del_init(&tsk->cg_list); write_unlock(&css_set_lock); } -- cgit v1.2.3-70-g09d2 From 504f52b5439aaf26d3e2c1d45ec10fce38c8dd27 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:41 -0700 Subject: mm: NUMA aware alloc_task_struct_node() All kthreads being created from a single helper task, they all use memory from a single node for their kernel stack and task struct. This patch suite creates kthread_create_on_cpu(), adding a 'cpu' parameter to parameters already used by kthread_create(). This parameter serves in allocating memory for the new kthread on its memory node if available. Users of this new function are : ksoftirqd, kworker, migration, pktgend... This patch: Add a node parameter to alloc_task_struct(), and change its name to alloc_task_struct_node() This change is needed to allow NUMA aware kthread_create_on_cpu() Signed-off-by: Eric Dumazet Acked-by: David S. 
Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/include/asm/processor.h | 2 +- arch/frv/kernel/process.c | 5 +++-- arch/ia64/include/asm/thread_info.h | 9 ++++++++- arch/um/include/asm/processor-generic.h | 2 +- kernel/fork.c | 10 ++++++---- 5 files changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h index 3744f2e47f4..4b789ab182b 100644 --- a/arch/frv/include/asm/processor.h +++ b/arch/frv/include/asm/processor.h @@ -137,7 +137,7 @@ unsigned long get_wchan(struct task_struct *p); #define KSTK_ESP(tsk) ((tsk)->thread.frame0->sp) /* Allocation and freeing of basic task resources. */ -extern struct task_struct *alloc_task_struct(void); +extern struct task_struct *alloc_task_struct_node(int node); extern void free_task_struct(struct task_struct *p); #define cpu_relax() barrier() diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index efad12071c2..9d359752646 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -44,9 +44,10 @@ asmlinkage void ret_from_fork(void); void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); -struct task_struct *alloc_task_struct(void) +struct task_struct *alloc_task_struct_node(int node) { - struct task_struct *p = kmalloc(THREAD_SIZE, GFP_KERNEL); + struct task_struct *p = kmalloc_node(THREAD_SIZE, GFP_KERNEL, node); + if (p) atomic_set((atomic_t *)(p+1), 1); return p; diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index b6a5ba2aca3..342004bbefe 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -84,7 +84,14 @@ struct thread_info { #define end_of_stack(p) (unsigned long *)((void *)(p) + IA64_RBS_OFFSET) #define __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -#define alloc_task_struct() ((struct task_struct *)__get_free_pages(GFP_KERNEL | __GFP_COMP, KERNEL_STACK_SIZE_ORDER)) +#define alloc_task_struct_node(node) \ +({ \ + struct page *page = alloc_pages_node(node, GFP_KERNEL | __GFP_COMP, \ + KERNEL_STACK_SIZE_ORDER); \ + struct task_struct *ret = page ? 
page_address(page) : NULL; \ + \ + ret; +}) #define free_task_struct(tsk) free_pages((unsigned long) (tsk), KERNEL_STACK_SIZE_ORDER) #endif /* !__ASSEMBLY */ diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index bed668824b5..d1d1b0d8a0c 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -66,7 +66,7 @@ struct thread_struct { .request = { 0 } \ } -extern struct task_struct *alloc_task_struct(void); +extern struct task_struct *alloc_task_struct_node(int node); static inline void release_thread(struct task_struct *task) { diff --git a/kernel/fork.c b/kernel/fork.c index 05b92c45701..cffbe8a4e1f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -109,8 +109,10 @@ int nr_processes(void) } #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) -# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) +# define alloc_task_struct_node(node) \ + kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) +# define free_task_struct(tsk) \ + kmem_cache_free(task_struct_cachep, (tsk)) static struct kmem_cache *task_struct_cachep; #endif @@ -249,12 +251,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - + int node = numa_node_id(); int err; prepare_to_copy(orig); - tsk = alloc_task_struct(); + tsk = alloc_task_struct_node(node); if (!tsk) return NULL; -- cgit v1.2.3-70-g09d2 From b6a84016bd2598e35ead635147fa53619982648d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:42 -0700 Subject: mm: NUMA aware alloc_thread_info_node() Add a node parameter to alloc_thread_info(), and change its name to alloc_thread_info_node() This change is needed to allow NUMA aware kthread_create_on_cpu() Signed-off-by: Eric Dumazet Acked-by: David S. 
Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/cris/include/asm/thread_info.h | 2 +- arch/frv/include/asm/thread_info.h | 13 ++++--------- arch/ia64/include/asm/thread_info.h | 5 +++-- arch/m32r/include/asm/thread_info.h | 13 ++++--------- arch/mips/include/asm/thread_info.h | 6 ++++-- arch/mn10300/include/asm/thread_info.h | 6 ++++-- arch/powerpc/include/asm/thread_info.h | 2 +- arch/powerpc/kernel/process.c | 4 ++-- arch/score/include/asm/thread_info.h | 2 +- arch/sh/include/asm/thread_info.h | 2 +- arch/sh/kernel/process.c | 16 +++++++++------- arch/sparc/include/asm/thread_info_32.h | 6 +++--- arch/sparc/include/asm/thread_info_64.h | 24 ++++++++++++------------ arch/sparc/mm/srmmu.c | 4 ++-- arch/sparc/mm/sun4c.c | 4 ++-- arch/tile/include/asm/thread_info.h | 2 +- arch/tile/kernel/process.c | 4 ++-- arch/x86/include/asm/thread_info.h | 10 ++++++++-- kernel/fork.c | 9 ++++++--- 19 files changed, 70 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h index 91776069ca8..29b74a10583 100644 --- a/arch/cris/include/asm/thread_info.h +++ b/arch/cris/include/asm/thread_info.h @@ -68,7 +68,7 @@ struct thread_info { #define init_thread_info (init_thread_union.thread_info) /* thread information allocation */ -#define alloc_thread_info(tsk) ((struct thread_info *) __get_free_pages(GFP_KERNEL,1)) +#define alloc_thread_info(tsk, node) ((struct thread_info *) __get_free_pages(GFP_KERNEL,1)) #define free_thread_info(ti) free_pages((unsigned long) (ti), 1) #endif /* !__ASSEMBLY__ */ diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h index 11f33ead29b..8582e9c7531 100644 --- a/arch/frv/include/asm/thread_info.h +++ b/arch/frv/include/asm/thread_info.h @@ -84,16 +84,11 @@ register struct thread_info *__current_thread_info asm("gr15"); /* thread information allocation */ #ifdef CONFIG_DEBUG_STACK_USAGE -#define alloc_thread_info(tsk) \ - ({ \ - struct thread_info *ret; \ - \ - ret = kzalloc(THREAD_SIZE, GFP_KERNEL); \ - \ - ret; \ - }) +#define alloc_thread_info_node(tsk, node) \ + kzalloc_node(THREAD_SIZE, GFP_KERNEL, node) #else -#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL) +#define alloc_thread_info_node(tsk) \ + kmalloc_node(THREAD_SIZE, GFP_KERNEL, node) #endif #define free_thread_info(info) kfree(info) diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 342004bbefe..6392908e8f9 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -59,11 +59,12 @@ struct thread_info { #ifndef ASM_OFFSETS_C /* how to get the thread information struct from C */ #define current_thread_info() ((struct thread_info *) ((char *) current + IA64_TASK_SIZE)) -#define alloc_thread_info(tsk) ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) +#define alloc_thread_info_node(tsk, node) \ + ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) #define task_thread_info(tsk) ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) #else #define current_thread_info() ((struct thread_info *) 0) -#define alloc_thread_info(tsk) ((struct thread_info *) 0) +#define alloc_thread_info_node(tsk, node) ((struct thread_info *) 0) #define task_thread_info(tsk) ((struct thread_info *) 0) #endif #define free_thread_info(ti) /* nothing */ diff --git 
a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h index 71faff5bcc2..0227dba4406 100644 --- a/arch/m32r/include/asm/thread_info.h +++ b/arch/m32r/include/asm/thread_info.h @@ -96,16 +96,11 @@ static inline struct thread_info *current_thread_info(void) /* thread information allocation */ #ifdef CONFIG_DEBUG_STACK_USAGE -#define alloc_thread_info(tsk) \ - ({ \ - struct thread_info *ret; \ - \ - ret = kzalloc(THREAD_SIZE, GFP_KERNEL); \ - \ - ret; \ - }) +#define alloc_thread_info_node(tsk, node) \ + kzalloc_node(THREAD_SIZE, GFP_KERNEL, node) #else -#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL) +#define alloc_thread_info_node(tsk, node) \ + kmalloc_node(THREAD_SIZE, GFP_KERNEL, node) #endif #define free_thread_info(info) kfree(info) diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index d309556cacf..d71160de4d1 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -88,9 +88,11 @@ register struct thread_info *__current_thread_info __asm__("$28"); #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR #ifdef CONFIG_DEBUG_STACK_USAGE -#define alloc_thread_info(tsk) kzalloc(THREAD_SIZE, GFP_KERNEL) +#define alloc_thread_info_node(tsk, node) \ + kzalloc_node(THREAD_SIZE, GFP_KERNEL, node) #else -#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL) +#define alloc_thread_info_node(tsk, node) \ + kmalloc_node(THREAD_SIZE, GFP_KERNEL, node) #endif #define free_thread_info(info) kfree(info) diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index aa07a4a5d79..8d53f09c878 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -124,9 +124,11 @@ static inline unsigned long current_stack_pointer(void) /* thread information allocation */ #ifdef CONFIG_DEBUG_STACK_USAGE -#define alloc_thread_info(tsk) kzalloc(THREAD_SIZE, GFP_KERNEL) +#define alloc_thread_info_node(tsk, node) \ + kzalloc_node(THREAD_SIZE, GFP_KERNEL, node) #else -#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL) +#define alloc_thread_info_node(tsk, node) \ + kmalloc_node(THREAD_SIZE, GFP_KERNEL, node) #endif #define free_thread_info(ti) kfree((ti)) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 65eb85976a0..d8529ef13b2 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -72,7 +72,7 @@ struct thread_info { #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR -extern struct thread_info *alloc_thread_info(struct task_struct *tsk); +extern struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node); extern void free_thread_info(struct thread_info *ti); #endif /* THREAD_SHIFT < PAGE_SHIFT */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 8303a6c65ef..f74f355a961 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1218,11 +1218,11 @@ void __ppc64_runlatch_off(void) static struct kmem_cache *thread_info_cache; -struct thread_info *alloc_thread_info(struct task_struct *tsk) +struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { struct thread_info *ti; - ti = kmem_cache_alloc(thread_info_cache, GFP_KERNEL); + ti = kmem_cache_alloc_node(thread_info_cache, GFP_KERNEL, node); if (unlikely(ti == NULL)) return NULL; #ifdef CONFIG_DEBUG_STACK_USAGE diff --git a/arch/score/include/asm/thread_info.h b/arch/score/include/asm/thread_info.h index 
8570d08f58c..2205c62284d 100644 --- a/arch/score/include/asm/thread_info.h +++ b/arch/score/include/asm/thread_info.h @@ -71,7 +71,7 @@ struct thread_info { register struct thread_info *__current_thread_info __asm__("r28"); #define current_thread_info() __current_thread_info -#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL) +#define alloc_thread_info_node(tsk, node) kmalloc_node(THREAD_SIZE, GFP_KERNEL, node) #define free_thread_info(info) kfree(info) #endif /* !__ASSEMBLY__ */ diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index c228946926e..ea2d5089de1 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -95,7 +95,7 @@ static inline struct thread_info *current_thread_info(void) #endif -extern struct thread_info *alloc_thread_info(struct task_struct *tsk); +extern struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node); extern void free_thread_info(struct thread_info *ti); extern void arch_task_cache_init(void); #define arch_task_cache_init arch_task_cache_init diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index dcb126dc76f..f39ad57296b 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -32,16 +32,16 @@ void free_thread_xstate(struct task_struct *tsk) #if THREAD_SHIFT < PAGE_SHIFT static struct kmem_cache *thread_info_cache; -struct thread_info *alloc_thread_info(struct task_struct *tsk) +struct thread_info *alloc_thread_info(struct task_struct *tsk, int node) { struct thread_info *ti; - - ti = kmem_cache_alloc(thread_info_cache, GFP_KERNEL); - if (unlikely(ti == NULL)) - return NULL; #ifdef CONFIG_DEBUG_STACK_USAGE - memset(ti, 0, THREAD_SIZE); + gfp_t mask = GFP_KERNEL | __GFP_ZERO; +#else + gfp_t mask = GFP_KERNEL; #endif + + ti = kmem_cache_alloc_node(thread_info_cache, mask, node); return ti; } @@ -64,7 +64,9 @@ struct thread_info *alloc_thread_info(struct task_struct *tsk) #else gfp_t mask = GFP_KERNEL; #endif - return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); + + return page ? page_address(page) : NULL; } void free_thread_info(struct thread_info *ti) diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index 9dd0318d3dd..fa575323341 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -82,8 +82,8 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR -BTFIXUPDEF_CALL(struct thread_info *, alloc_thread_info, void) -#define alloc_thread_info(tsk) BTFIXUP_CALL(alloc_thread_info)() +BTFIXUPDEF_CALL(struct thread_info *, alloc_thread_info_node, int) +#define alloc_thread_info_node(tsk, node) BTFIXUP_CALL(alloc_thread_info_node)(node) BTFIXUPDEF_CALL(void, free_thread_info, struct thread_info *) #define free_thread_info(ti) BTFIXUP_CALL(free_thread_info)(ti) @@ -92,7 +92,7 @@ BTFIXUPDEF_CALL(void, free_thread_info, struct thread_info *) /* * Size of kernel stack for each process. - * Observe the order of get_free_pages() in alloc_thread_info(). + * Observe the order of get_free_pages() in alloc_thread_info_node(). * The sun4 has 8K stack too, because it's short on memory, and 16K is a waste. 
*/ #define THREAD_SIZE 8192 diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index fb2ea7705a4..60d86be1a53 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -146,21 +146,21 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR #ifdef CONFIG_DEBUG_STACK_USAGE -#define alloc_thread_info(tsk) \ -({ \ - struct thread_info *ret; \ - \ - ret = (struct thread_info *) \ - __get_free_pages(GFP_KERNEL, __THREAD_INFO_ORDER); \ - if (ret) \ - memset(ret, 0, PAGE_SIZE<<__THREAD_INFO_ORDER); \ - ret; \ -}) +#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) #else -#define alloc_thread_info(tsk) \ - ((struct thread_info *)__get_free_pages(GFP_KERNEL, __THREAD_INFO_ORDER)) +#define THREAD_FLAGS (GFP_KERNEL) #endif +#define alloc_thread_info_node(tsk, node) \ +({ \ + struct page *page = alloc_pages_node(node, THREAD_FLAGS, \ + __THREAD_INFO_ORDER); \ + struct thread_info *ret; \ + \ + ret = page ? page_address(page) : NULL; \ + ret; \ +}) + #define free_thread_info(ti) \ free_pages((unsigned long)(ti),__THREAD_INFO_ORDER) diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 92319aa8b66..fe09fd8be69 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -650,7 +650,7 @@ static void srmmu_unmapiorange(unsigned long virt_addr, unsigned int len) * mappings on the kernel stack without any special code as we did * need on the sun4c. */ -static struct thread_info *srmmu_alloc_thread_info(void) +static struct thread_info *srmmu_alloc_thread_info_node(int node) { struct thread_info *ret; @@ -2271,7 +2271,7 @@ void __init ld_mmu_srmmu(void) BTFIXUPSET_CALL(mmu_info, srmmu_mmu_info, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(alloc_thread_info, srmmu_alloc_thread_info, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(alloc_thread_info_node, srmmu_alloc_thread_info_node, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(free_thread_info, srmmu_free_thread_info, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pte_to_pgoff, srmmu_pte_to_pgoff, BTFIXUPCALL_NORM); diff --git a/arch/sparc/mm/sun4c.c b/arch/sparc/mm/sun4c.c index b5137cc2aba..a2350b5e68a 100644 --- a/arch/sparc/mm/sun4c.c +++ b/arch/sparc/mm/sun4c.c @@ -922,7 +922,7 @@ static inline void garbage_collect(int entry) free_locked_segment(BUCKET_ADDR(entry)); } -static struct thread_info *sun4c_alloc_thread_info(void) +static struct thread_info *sun4c_alloc_thread_info_node(int node) { unsigned long addr, pages; int entry; @@ -2155,7 +2155,7 @@ void __init ld_mmu_sun4c(void) BTFIXUPSET_CALL(__swp_offset, sun4c_swp_offset, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(__swp_entry, sun4c_swp_entry, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(alloc_thread_info, sun4c_alloc_thread_info, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(alloc_thread_info_node, sun4c_alloc_thread_info_node, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(free_thread_info, sun4c_free_thread_info, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(mmu_info, sun4c_mmu_info, BTFIXUPCALL_NORM); diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index 9e8e9c4dfa2..3405b52853b 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -84,7 +84,7 @@ register unsigned long stack_pointer __asm__("sp"); ((struct thread_info *)(stack_pointer & -THREAD_SIZE)) #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR -extern struct thread_info *alloc_thread_info(struct task_struct *task); +extern struct thread_info *alloc_thread_info_node(struct task_struct *task, int node); 
extern void free_thread_info(struct thread_info *info); /* Sit on a nap instruction until interrupted. */ diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index b9cd962e1d3..d0065103eb7 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -109,7 +109,7 @@ void cpu_idle(void) } } -struct thread_info *alloc_thread_info(struct task_struct *task) +struct thread_info *alloc_thread_info_node(struct task_struct *task, int node) { struct page *page; gfp_t flags = GFP_KERNEL; @@ -118,7 +118,7 @@ struct thread_info *alloc_thread_info(struct task_struct *task) flags |= __GFP_ZERO; #endif - page = alloc_pages(flags, THREAD_SIZE_ORDER); + page = alloc_pages_node(node, flags, THREAD_SIZE_ORDER); if (!page) return NULL; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f0b6e5dbc5a..1f2e61e2898 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -161,8 +161,14 @@ struct thread_info { #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR -#define alloc_thread_info(tsk) \ - ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) +#define alloc_thread_info_node(tsk, node) \ +({ \ + struct page *page = alloc_pages_node(node, THREAD_FLAGS, \ + THREAD_ORDER); \ + struct thread_info *ret = page ? page_address(page) : NULL; \ + \ + ret; \ +}) #ifdef CONFIG_X86_32 diff --git a/kernel/fork.c b/kernel/fork.c index cffbe8a4e1f..cbc6adc6e89 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -117,14 +117,17 @@ static struct kmem_cache *task_struct_cachep; #endif #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR -static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) +static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, + int node) { #ifdef CONFIG_DEBUG_STACK_USAGE gfp_t mask = GFP_KERNEL | __GFP_ZERO; #else gfp_t mask = GFP_KERNEL; #endif - return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); + + return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) @@ -260,7 +263,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) if (!tsk) return NULL; - ti = alloc_thread_info(tsk); + ti = alloc_thread_info_node(tsk, node); if (!ti) { free_task_struct(tsk); return NULL; -- cgit v1.2.3-70-g09d2 From 207205a2ba2655652fe46a60b49838af6c16a919 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:44 -0700 Subject: kthread: NUMA aware kthread_create_on_node() All kthreads being created from a single helper task, they all use memory from a single node for their kernel stack and task struct. This patch suite creates kthread_create_on_node(), adding a 'cpu' parameter to parameters already used by kthread_create(). This parameter serves in allocating memory for the new kthread on its memory node if possible. Signed-off-by: Eric Dumazet Acked-by: David S. 
Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 14 ++++++++++---- include/linux/sched.h | 1 + kernel/fork.c | 3 ++- kernel/kthread.c | 31 +++++++++++++++++++++++++------ 4 files changed, 38 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 7ff16f7d3ed..1e923e5e88e 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -4,10 +4,15 @@ #include #include -struct task_struct *kthread_create(int (*threadfn)(void *data), - void *data, - const char namefmt[], ...) - __attribute__((format(printf, 3, 4))); +struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), + void *data, + int node, + const char namefmt[], ...) + __attribute__((format(printf, 4, 5))); + +#define kthread_create(threadfn, data, namefmt, arg...) \ + kthread_create_on_node(threadfn, data, -1, namefmt, ##arg) + /** * kthread_run - create and wake a thread. @@ -34,6 +39,7 @@ void *kthread_data(struct task_struct *k); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; +extern int tsk_fork_get_node(struct task_struct *tsk); /* * Simple work processor based on kthread. diff --git a/include/linux/sched.h b/include/linux/sched.h index c15936fe998..4b601be3dac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1471,6 +1471,7 @@ struct task_struct { #ifdef CONFIG_NUMA struct mempolicy *mempolicy; /* Protected by alloc_lock */ short il_next; + short pref_node_fork; #endif atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; diff --git a/kernel/fork.c b/kernel/fork.c index cbc6adc6e89..a8f64f8ec7e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -254,7 +255,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - int node = numa_node_id(); + int node = tsk_fork_get_node(orig); int err; prepare_to_copy(orig); diff --git a/kernel/kthread.c b/kernel/kthread.c index c55afba990a..684ab3f7dd7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -27,6 +27,7 @@ struct kthread_create_info /* Information passed to kthread() from kthreadd. */ int (*threadfn)(void *data); void *data; + int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; @@ -98,10 +99,23 @@ static int kthread(void *_create) do_exit(ret); } +/* called from do_fork() to get node information for about to be created task */ +int tsk_fork_get_node(struct task_struct *tsk) +{ +#ifdef CONFIG_NUMA + if (tsk == kthreadd_task) + return tsk->pref_node_fork; +#endif + return numa_node_id(); +} + static void create_kthread(struct kthread_create_info *create) { int pid; +#ifdef CONFIG_NUMA + current->pref_node_fork = create->node; +#endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { @@ -111,15 +125,18 @@ static void create_kthread(struct kthread_create_info *create) } /** - * kthread_create - create a kthread. + * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. + * @node: memory node number. * @namefmt: printf-style name for the thread. 
* * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). * + * If thread is going to be bound on a particular cpu, give its node + * in @node, to get NUMA affinity for kthread stack, or else give -1. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a * standalone thread for which noone will call kthread_stop(), or @@ -129,15 +146,17 @@ static void create_kthread(struct kthread_create_info *create) * * Returns a task_struct or ERR_PTR(-ENOMEM). */ -struct task_struct *kthread_create(int (*threadfn)(void *data), - void *data, - const char namefmt[], - ...) +struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), + void *data, + int node, + const char namefmt[], + ...) { struct kthread_create_info create; create.threadfn = threadfn; create.data = data; + create.node = node; init_completion(&create.done); spin_lock(&kthread_create_lock); @@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } return create.result; } -EXPORT_SYMBOL(kthread_create); +EXPORT_SYMBOL(kthread_create_on_node); /** * kthread_bind - bind a just-created kthread to a cpu. -- cgit v1.2.3-70-g09d2 From 94dcf29a11b3d20a28790598d701f98484a969da Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:45 -0700 Subject: kthread: use kthread_create_on_node() ksoftirqd, kworker, migration, and pktgend kthreads can be created with kthread_create_on_node(), to get proper NUMA affinities for their stack and task_struct. Signed-off-by: Eric Dumazet Acked-by: David S. Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Acked-by: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 5 ++++- kernel/stop_machine.c | 6 ++++-- kernel/workqueue.c | 6 ++++-- net/core/pktgen.c | 5 ++++- 4 files changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 56e5dec837f..735d8709517 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -845,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); + p = kthread_create_on_node(run_ksoftirqd, + hcpu, + cpu_to_node(hotcpu), + "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); return notifier_from_errno(PTR_ERR(p)); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2df820b03be..e3516b29076 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: BUG_ON(stopper->thread || stopper->enabled || !list_empty(&stopper->works)); - p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", - cpu); + p = kthread_create_on_node(cpu_stopper_thread, + stopper, + cpu_to_node(cpu), + "migration/%d", cpu); if (IS_ERR(p)) return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5ca7ce9ce75..04ef830690e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1366,8 +1366,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) worker->id = id; if (!on_unbound_cpu) - worker->task = kthread_create(worker_thread, 
worker, - "kworker/%u:%d", gcwq->cpu, id); + worker->task = kthread_create_on_node(worker_thread, + worker, + cpu_to_node(gcwq->cpu), + "kworker/%u:%d", gcwq->cpu, id); else worker->task = kthread_create(worker_thread, worker, "kworker/u:%d", id); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 0c55eaa70e3..aeeece72b72 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3761,7 +3761,10 @@ static int __init pktgen_create_thread(int cpu) list_add_tail(&t->th_list, &pktgen_threads); init_completion(&t->start_done); - p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu); + p = kthread_create_on_node(pktgen_thread_worker, + t, + cpu_to_node(cpu), + "kpktgend_%d", cpu); if (IS_ERR(p)) { pr_err("kernel_thread() failed for cpu %d\n", t->cpu); list_del(&t->th_list); -- cgit v1.2.3-70-g09d2 From d404ab0a1133e95557bb7deab2a49b348dfeba85 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 22 Mar 2011 16:34:04 -0700 Subject: move x86 specific oops=panic to generic code The oops=panic cmdline option is not x86 specific, move it to generic code. Update documentation. Signed-off-by: Olaf Hering Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 5 +++++ Documentation/x86/x86_64/boot-options.txt | 5 ----- arch/x86/kernel/dumpstack.c | 10 ---------- kernel/panic.c | 10 ++++++++++ 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index d18a9e12152..a3b688287a8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1825,6 +1825,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. perfmon on Intel CPUs instead of the CPU specific event set. + oops=panic Always panic on oopses. Default is to just kill the process, + but there is a small probability of deadlocking the machine. + This will also cause panics on machine check exceptions. + Useful together with panic=30 to trigger a reboot. + OSS [HW,OSS] See Documentation/sound/oss/oss-parameters.txt diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 48c13b8ab90..092e596a130 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -293,11 +293,6 @@ IOMMU (input/output memory management unit) Debugging - oops=panic Always panic on oopses. Default is to just kill the process, - but there is a small probability of deadlocking the machine. - This will also cause panics on machine check exceptions. - Useful together with panic=30 to trigger a reboot. - kstack=N Print N words from the kernel stack in oops dumps. pagefaulttrace Dump all page faults. 
Only useful for extreme debugging diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 999e2793590..81ac6c78c01 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -322,16 +322,6 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - static int __init kstack_setup(char *s) { if (!s) diff --git a/kernel/panic.c b/kernel/panic.c index 991bb87a170..69231670eb9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); -- cgit v1.2.3-70-g09d2 From 34db18a054c600b6f81787165669dc572fe4de25 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 22 Mar 2011 16:34:06 -0700 Subject: smp: move smp setup functions to kernel/smp.c Move setup_nr_cpu_ids(), smp_init() and some other SMP boot parameter setup functions from init/main.c to kenrel/smp.c, saves some #ifdef CONFIG_SMP. Signed-off-by: WANG Cong Cc: Rakib Mullick Cc: David Howells Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Tejun Heo Cc: Arnd Bergmann Cc: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 2 ++ init/main.c | 90 +---------------------------------------------------- kernel/smp.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 89 deletions(-) (limited to 'kernel') diff --git a/include/linux/smp.h b/include/linux/smp.h index 6dc95cac6b3..48159dd320d 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -114,6 +114,8 @@ int on_each_cpu(smp_call_func_t func, void *info, int wait); void smp_prepare_boot_cpu(void); extern unsigned int setup_max_cpus; +extern void __init setup_nr_cpu_ids(void); +extern void __init smp_init(void); #else /* !SMP */ diff --git a/init/main.c b/init/main.c index 33c37c379e9..3627bb37225 100644 --- a/init/main.c +++ b/init/main.c @@ -129,63 +129,6 @@ static char *static_command_line; static char *execute_command; static char *ramdisk_execute_command; -#ifdef CONFIG_SMP -/* Setup configured maximum number of CPUs to activate */ -unsigned int setup_max_cpus = NR_CPUS; -EXPORT_SYMBOL(setup_max_cpus); - - -/* - * Setup routine for controlling SMP activation - * - * Command-line option of "nosmp" or "maxcpus=0" will disable SMP - * activation entirely (the MPS table probe still happens, though). - * - * Command-line option of "maxcpus=", where is an integer - * greater than 0, limits the maximum number of CPUs activated in - * SMP mode to . 
- */ - -void __weak arch_disable_smp_support(void) { } - -static int __init nosmp(char *str) -{ - setup_max_cpus = 0; - arch_disable_smp_support(); - - return 0; -} - -early_param("nosmp", nosmp); - -/* this is hard limit */ -static int __init nrcpus(char *str) -{ - int nr_cpus; - - get_option(&str, &nr_cpus); - if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) - nr_cpu_ids = nr_cpus; - - return 0; -} - -early_param("nr_cpus", nrcpus); - -static int __init maxcpus(char *str) -{ - get_option(&str, &setup_max_cpus); - if (setup_max_cpus == 0) - arch_disable_smp_support(); - - return 0; -} - -early_param("maxcpus", maxcpus); -#else -static const unsigned int setup_max_cpus = NR_CPUS; -#endif - /* * If set, this is an indication to the drivers that reset the underlying * device before going ahead with the initialization otherwise driver might @@ -362,7 +305,7 @@ static int __init rdinit_setup(char *str) __setup("rdinit=", rdinit_setup); #ifndef CONFIG_SMP - +static const unsigned int setup_max_cpus = NR_CPUS; #ifdef CONFIG_X86_LOCAL_APIC static void __init smp_init(void) { @@ -374,37 +317,6 @@ static void __init smp_init(void) static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } - -#else - -/* Setup number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; -EXPORT_SYMBOL(nr_cpu_ids); - -/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ -static void __init setup_nr_cpu_ids(void) -{ - nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; -} - -/* Called by boot processor to activate the rest. */ -static void __init smp_init(void) -{ - unsigned int cpu; - - /* FIXME: This should be done in userspace --RR */ - for_each_present_cpu(cpu) { - if (num_online_cpus() >= setup_max_cpus) - break; - if (!cpu_online(cpu)) - cpu_up(cpu); - } - - /* Any cleanup work */ - printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); - smp_cpus_done(setup_max_cpus); -} - #endif /* diff --git a/kernel/smp.c b/kernel/smp.c index 7cbd0f293df..73a19519355 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -604,6 +604,87 @@ void ipi_call_unlock_irq(void) } #endif /* USE_GENERIC_SMP_HELPERS */ +/* Setup configured maximum number of CPUs to activate */ +unsigned int setup_max_cpus = NR_CPUS; +EXPORT_SYMBOL(setup_max_cpus); + + +/* + * Setup routine for controlling SMP activation + * + * Command-line option of "nosmp" or "maxcpus=0" will disable SMP + * activation entirely (the MPS table probe still happens, though). + * + * Command-line option of "maxcpus=", where is an integer + * greater than 0, limits the maximum number of CPUs activated in + * SMP mode to . 
+ */ + +void __weak arch_disable_smp_support(void) { } + +static int __init nosmp(char *str) +{ + setup_max_cpus = 0; + arch_disable_smp_support(); + + return 0; +} + +early_param("nosmp", nosmp); + +/* this is hard limit */ +static int __init nrcpus(char *str) +{ + int nr_cpus; + + get_option(&str, &nr_cpus); + if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + nr_cpu_ids = nr_cpus; + + return 0; +} + +early_param("nr_cpus", nrcpus); + +static int __init maxcpus(char *str) +{ + get_option(&str, &setup_max_cpus); + if (setup_max_cpus == 0) + arch_disable_smp_support(); + + return 0; +} + +early_param("maxcpus", maxcpus); + +/* Setup number of possible processor ids */ +int nr_cpu_ids __read_mostly = NR_CPUS; +EXPORT_SYMBOL(nr_cpu_ids); + +/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ +void __init setup_nr_cpu_ids(void) +{ + nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; +} + +/* Called by boot processor to activate the rest. */ +void __init smp_init(void) +{ + unsigned int cpu; + + /* FIXME: This should be done in userspace --RR */ + for_each_present_cpu(cpu) { + if (num_online_cpus() >= setup_max_cpus) + break; + if (!cpu_online(cpu)) + cpu_up(cpu); + } + + /* Any cleanup work */ + printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); + smp_cpus_done(setup_max_cpus); +} + /* * Call a function on all processors. May be used during early boot while * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead -- cgit v1.2.3-70-g09d2 From 4d51985e484dd11d9047dfcd1278ec9ccfb435d5 Mon Sep 17 00:00:00 2001 From: Michael Rodriguez Date: Tue, 22 Mar 2011 16:34:07 -0700 Subject: kernel/cpu.c: fix many errors related to style. Change the printk() calls to have the KERN_INFO/KERN_ERROR stuff, and fixes other coding style errors. Not _all_ of them are gone, though. 
[akpm@linux-foundation.org: revert the bits I disagree with] Signed-off-by: Michael Rodriguez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 156cc555614..c95fc4df0fa 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v) { BUG_ON(cpu_notify(val, v)); } - EXPORT_SYMBOL(register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) @@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param) return err; cpu_notify(CPU_DYING | param->mod, param->hcpu); - return 0; } @@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { nr_calls--; @@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - printk("%s: attempt to bring up CPU %u failed\n", + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", __func__, cpu); goto out_notify; } @@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - printk("Enabling non-boot CPUs ...\n"); + printk(KERN_INFO "Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { - printk("CPU%d is up\n", cpu); + printk(KERN_INFO "CPU%d is up\n", cpu); continue; } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); @@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) */ /* cpu_bit_bitmap[0] is empty - so we can back into it */ -#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) +#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x)) #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) -- cgit v1.2.3-70-g09d2 From 9bfb23fc4a481650e60d22dbe84c0fd5a9d49bba Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Mar 2011 16:34:09 -0700 Subject: sys_unshare: remove the dead CLONE_THREAD/SIGHAND/VM code Cleanup: kill the dead code which does nothing but complicates the code and confuses the reader. sys_unshare(CLONE_THREAD/SIGHAND/VM) is not really implemented, and I doubt very much it will ever work. At least, nobody even tried since the original 99d1419d96d7df9cfa56 ("unshare system call -v5: system call handler function") was applied more than 4 years ago. And the code is not consistent. unshare_thread() always fails unconditionally, while unshare_sighand() and unshare_vm() pretend to work if there is nothing to unshare. Remove unshare_thread(), unshare_sighand(), unshare_vm() helpers and related variables and add a simple CLONE_THREAD | CLONE_SIGHAND| CLONE_VM check into check_unshare_flags(). Also, move the "CLONE_NEWNS needs CLONE_FS" check from check_unshare_flags() to sys_unshare(). This looks more consistent and matches the similar do_sysvsem check in sys_unshare(). Note: with or without this patch "atomic_read(mm->mm_users) > 1" can give a false positive due to get_task_mm(). Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Janak Desai Cc: Daniel Lezcano Cc: "Eric W. 
Biederman" Cc: KOSAKI Motohiro Cc: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 123 ++++++++++++---------------------------------------------- 1 file changed, 25 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a8f64f8ec7e..f2b494d7c55 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1519,38 +1519,24 @@ void __init proc_caches_init(void) } /* - * Check constraints on flags passed to the unshare system call and - * force unsharing of additional process context as appropriate. + * Check constraints on flags passed to the unshare system call. */ -static void check_unshare_flags(unsigned long *flags_ptr) +static int check_unshare_flags(unsigned long unshare_flags) { + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + return -EINVAL; /* - * If unsharing a thread from a thread group, must also - * unshare vm. - */ - if (*flags_ptr & CLONE_THREAD) - *flags_ptr |= CLONE_VM; - - /* - * If unsharing vm, must also unshare signal handlers. - */ - if (*flags_ptr & CLONE_VM) - *flags_ptr |= CLONE_SIGHAND; - - /* - * If unsharing namespace, must also unshare filesystem information. + * Not implemented, but pretend it works if there is nothing to + * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND + * needs to unshare vm. */ - if (*flags_ptr & CLONE_NEWNS) - *flags_ptr |= CLONE_FS; -} - -/* - * Unsharing of tasks created with CLONE_THREAD is not supported yet - */ -static int unshare_thread(unsigned long unshare_flags) -{ - if (unshare_flags & CLONE_THREAD) - return -EINVAL; + if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { + /* FIXME: get_task_mm() increments ->mm_users */ + if (atomic_read(¤t->mm->mm_users) > 1) + return -EINVAL; + } return 0; } @@ -1576,34 +1562,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) return 0; } -/* - * Unsharing of sighand is not supported yet - */ -static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) -{ - struct sighand_struct *sigh = current->sighand; - - if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) - return -EINVAL; - else - return 0; -} - -/* - * Unshare vm if it is being shared - */ -static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) -{ - struct mm_struct *mm = current->mm; - - if ((unshare_flags & CLONE_VM) && - (mm && atomic_read(&mm->mm_users) > 1)) { - return -EINVAL; - } - - return 0; -} - /* * Unshare file descriptor table if it is being shared */ @@ -1632,23 +1590,21 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp */ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { - int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct sighand_struct *new_sigh = NULL; - struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; + int err; - check_unshare_flags(&unshare_flags); - - /* Return -EINVAL for all unsupported flags */ - err = -EINVAL; - if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + err = check_unshare_flags(unshare_flags); + if (err) goto bad_unshare_out; + /* + * If unsharing namespace, must also unshare filesystem information. 
+ */ + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old @@ -1656,21 +1612,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; - if ((err = unshare_thread(unshare_flags))) - goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) - goto bad_unshare_cleanup_thread; - if ((err = unshare_sighand(unshare_flags, &new_sigh))) - goto bad_unshare_cleanup_fs; - if ((err = unshare_vm(unshare_flags, &new_mm))) - goto bad_unshare_cleanup_sigh; + goto bad_unshare_out; if ((err = unshare_fd(unshare_flags, &new_fd))) - goto bad_unshare_cleanup_vm; + goto bad_unshare_cleanup_fs; if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) goto bad_unshare_cleanup_fd; - if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1696,19 +1646,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) spin_unlock(&fs->lock); } - if (new_mm) { - mm = current->mm; - active_mm = current->active_mm; - current->mm = new_mm; - current->active_mm = new_mm; - if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&mm->oom_disable_count); - atomic_inc(&new_mm->oom_disable_count); - } - activate_mm(active_mm, new_mm); - new_mm = mm; - } - if (new_fd) { fd = current->files; current->files = new_fd; @@ -1725,20 +1662,10 @@ bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); -bad_unshare_cleanup_vm: - if (new_mm) - mmput(new_mm); - -bad_unshare_cleanup_sigh: - if (new_sigh) - if (atomic_dec_and_test(&new_sigh->count)) - kmem_cache_free(sighand_cachep, new_sigh); - bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); -bad_unshare_cleanup_thread: bad_unshare_out: return err; } -- cgit v1.2.3-70-g09d2 From fef2c9bc1b54c0261324a96e948c0b849796e896 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 22 Mar 2011 16:34:16 -0700 Subject: kernel/watchdog.c: allow hardlockup to panic by default When a cpu is considered stuck, instead of limping along and just printing a warning, it is sometimes preferred to just panic, let kdump capture the vmcore and reboot. This gets the machine back into a stable state quickly while saving the info that got it into a stuck state to begin with. Add a Kconfig option to allow users to set the hardlockup to panic by default. Also add in a 'nmi_watchdog=nopanic' to override this. [akpm@linux-foundation.org: fix strncmp length] Signed-off-by: Don Zickus Acked-by: Peter Zijlstra Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 5 +++-- kernel/watchdog.c | 5 ++++- lib/Kconfig.debug | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a3b688287a8..e9261e938f6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1597,11 +1597,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. 
Format: [state][,regs][,debounce][,die] nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels - Format: [panic,][num] + Format: [panic,][nopanic,][num] Valid num: 0 0 - turn nmi_watchdog off When panic is specified, panic when an NMI watchdog - timeout occurs. + timeout occurs (or 'nopanic' to override the opposite + default). This is useful when you use a panic=... timeout and need the box quickly up again. diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18bb15776c5..054a67cca9d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); * Should we panic when a soft-lockup or hard-lockup occurs: */ #ifdef CONFIG_HARDLOCKUP_DETECTOR -static int hardlockup_panic; +static int hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static int __init hardlockup_panic_setup(char *str) { if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) watchdog_enabled = 0; return 1; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 191c5c4c89f..fb0afeff943 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -171,6 +171,23 @@ config HARDLOCKUP_DETECTOR def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI && \ !ARCH_HAS_NMI_WATCHDOG +config BOOTPARAM_HARDLOCKUP_PANIC + bool "Panic (Reboot) On Hard Lockups" + depends on LOCKUP_DETECTOR + help + Say Y here to enable the kernel to panic on "hard lockups", + which are bugs that cause the kernel to loop in kernel + mode with interrupts disabled for more than 60 seconds. + + Say N if unsure. + +config BOOTPARAM_HARDLOCKUP_PANIC_VALUE + int + depends on LOCKUP_DETECTOR + range 0 1 + default 0 if !BOOTPARAM_HARDLOCKUP_PANIC + default 1 if BOOTPARAM_HARDLOCKUP_PANIC + config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" depends on LOCKUP_DETECTOR -- cgit v1.2.3-70-g09d2 From f99a99330f85a84c346ddeb4adc72dbfad9b9e3e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 22 Mar 2011 16:34:17 -0700 Subject: kernel/watchdog.c: always return NOTIFY_OK during cpu up/down events This patch addresses a couple of problems. One was the case when the hardlockup failed to start, it also failed to start the softlockup. There were valid cases when the hardlockup shouldn't start and that shouldn't block the softlockup (no lapic, bios controls perf counters). The second problem was when the hardlockup failed to start on boxes (from a no lapic or bios controlled perf counter case), it reported failure to the cpu notifier chain. This blocked the notifier from continuing to start other more critical pieces of cpu bring-up (in our case based on a 2.6.32 fork, it was the mce). As a result, during soft cpu online/offline testing, the system would panic when a cpu was offlined because the cpu notifier would succeed in processing a watchdog disable cpu event and would panic in the mce case as a result of un-initialized variables from a never executed cpu up event. I realized the hardlockup/softlockup cases are really just debugging aids and should never impede the progress of a cpu up/down event. Therefore I modified the code to always return NOTIFY_OK and instead rely on printks to inform the user of problems. 
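As a minimal sketch of the convention involved (the callback below is hypothetical, not part of the patch): the CPU notifier chain treats a notifier_from_errno() return as a veto, so an error reported by a debugging aid stops later, more important notifiers from ever running for that CPU and fails the bring-up, while NOTIFY_OK keeps the failure local to the watchdog:

static int __cpuinit demo_cpu_callback(struct notifier_block *nfb,
				       unsigned long action, void *hcpu)
{
	int err = 0;

	if (action == CPU_UP_PREPARE)
		err = -ENODEV;		/* e.g. no lapic, or bios-owned perf counters */

	if (err)
		printk(KERN_ERR "debug helper unavailable on CPU %lu\n",
		       (unsigned long)hcpu);

	return NOTIFY_OK;		/* never veto cpu bring-up over a debug aid */
}
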
Signed-off-by: Don Zickus Acked-by: Peter Zijlstra Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 054a67cca9d..140dce75045 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -418,19 +418,22 @@ static int watchdog_prepare_cpu(int cpu) static int watchdog_enable(int cpu) { struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - int err; + int err = 0; /* enable the perf event */ err = watchdog_nmi_enable(cpu); - if (err) - return err; + + /* Regardless of err above, fall through and start softlockup */ /* create the watchdog thread */ if (!p) { p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); - return PTR_ERR(p); + if (!err) + /* if hardlockup hasn't already set this */ + err = PTR_ERR(p); + goto out; } kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; @@ -438,7 +441,8 @@ static int watchdog_enable(int cpu) wake_up_process(p); } - return 0; +out: + return err; } static void watchdog_disable(int cpu) @@ -550,7 +554,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif /* CONFIG_HOTPLUG_CPU */ } - return notifier_from_errno(err); + + /* + * hardlockup and softlockup are not important enough + * to block cpu bring up. Just always succeed and + * rely on printk output to flag problems. + */ + return NOTIFY_OK; } static struct notifier_block __cpuinitdata cpu_nfb = { -- cgit v1.2.3-70-g09d2 From 7bf693951a8e5f7e600a45b74d91d962a453146e Mon Sep 17 00:00:00 2001 From: "Fabio M. Di Nitto" Date: Tue, 22 Mar 2011 16:34:20 -0700 Subject: console: allow to retain boot console via boot option keep_bootcon On some architectures, the boot process involves de-registering the boot console (early boot), initialize drivers and then re-register the console. This mechanism introduces a window in which no printk can happen on the console and messages are buffered and then printed once the new console is available. If a kernel crashes during this window, all it's left on the boot console is "console [foo] enabled, bootconsole disabled" making debug of the crash rather 'interesting'. By adding "keep_bootcon" option, do not unregister the boot console, that will allow to printk everything that is happening up to the crash. The option is clearly meant only for debugging purposes as it introduces lots of duplicated info printed on console, but will make bug report from users easier as it doesn't require a kernel build just to figure out where we crash. Signed-off-by: Fabio M. Di Nitto Acked-by: David S. Miller Cc: Alan Cox Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 6 ++++++ kernel/printk.c | 16 +++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e9261e938f6..c357a31411c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -872,6 +872,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. If specified, z/VM IUCV HVC accepts connections from listed z/VM user IDs only. + keep_bootcon [KNL] + Do not unregister boot console at start. 
This is only + useful for debugging when something happens in the window + between unregistering the boot console and initializing + the real console. + i2c_bus= [HW] Override the default board specific I2C bus speed or register an additional I2C bus that is not registered from board initialization code. diff --git a/kernel/printk.c b/kernel/printk.c index 33284adb218..2b591f252e5 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1316,6 +1316,18 @@ void console_start(struct console *console) } EXPORT_SYMBOL(console_start); +static int __read_mostly keep_bootcon; + +static int __init keep_bootcon_setup(char *str) +{ + keep_bootcon = 1; + printk(KERN_INFO "debug: skip boot console de-registration.\n"); + + return 0; +} + +early_param("keep_bootcon", keep_bootcon_setup); + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -1463,7 +1475,9 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + if (bcon && + ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && + !keep_bootcon) { /* we need to iterate through twice, to make sure we print * everything out, before we unregister the console(s) */ -- cgit v1.2.3-70-g09d2 From fe3d8ad31cf51b062bbb8a9609eeb1d0c41a7f30 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 22 Mar 2011 16:34:21 -0700 Subject: console: prevent registered consoles from dumping old kernel message over again For a platform with many consoles like: "console=tty1 console=ttyMFD2 console=ttyS0 earlyprintk=mrst" Each time when the non "selected_console" (tty1 and ttyMFD2 here) get registered, the existing kernel message will be printed out on registered consoles again, the "mrst" early console will get some same message for 3 times, and "tty1" will get some for twice. As suggested by Andrew Morton, every time a new console is registered, it will be set as the "exclusive" console which will dump the already existing kernel messages. Signed-off-by: Feng Tang Cc: Greg KH Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 2b591f252e5..a53607eea6d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -112,6 +112,11 @@ static unsigned log_start; /* Index into log_buf: next char to be read by syslog static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ +/* + * If exclusive_console is non-NULL then only this console is to be printed to. 
+ */ +static struct console *exclusive_console; + /* * Array of consoles built from command line options (console=) */ @@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end) struct console *con; for_each_console(con) { + if (exclusive_console && con != exclusive_console) + continue; if ((con->flags & CON_ENABLED) && con->write && (cpu_online(smp_processor_id()) || (con->flags & CON_ANYTIME))) @@ -1230,6 +1237,11 @@ void console_unlock(void) local_irq_restore(flags); } console_locked = 0; + + /* Release the exclusive_console once it is used */ + if (unlikely(exclusive_console)) + exclusive_console = NULL; + up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) @@ -1464,6 +1476,12 @@ void register_console(struct console *newcon) spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; spin_unlock_irqrestore(&logbuf_lock, flags); + /* + * We're about to replay the log buffer. Only do this to the + * just-registered console to avoid excessive message spam to + * the already-registered consoles. + */ + exclusive_console = newcon; } console_unlock(); console_sysfs_notify(); -- cgit v1.2.3-70-g09d2 From 9f36e2c448007b54851e7e4fa48da97d1477a175 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 22 Mar 2011 16:34:22 -0700 Subject: printk: use %pK for /proc/kallsyms and /proc/modules In an effort to reduce kernel address leaks that might be used to help target kernel privilege escalation exploits, this patch uses %pK when displaying addresses in /proc/kallsyms, /proc/modules, and /sys/module/*/sections/*. Note that this changes %x to %p, so some legitimately 0 values in /proc/kallsyms would have changed from 00000000 to "(null)". To avoid this, "(null)" is not used when using the "K" format. Anything that was already successfully parsing "(null)" in addition to full hex digits should have no problem with this change. (Thanks to Joe Perches for the suggestion.) Due to the %x to %p, "void *" casts are needed since these addresses are already "unsigned long" everywhere internally, due to their starting life as ELF section offsets. Signed-off-by: Kees Cook Cc: Eugene Teo Cc: Dan Rosenberg Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 10 ++++------ kernel/module.c | 4 ++-- lib/vsprintf.c | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b575..75dcca37d61 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -477,13 +477,11 @@ static int s_show(struct seq_file *m, void *p) */ type = iter->exported ? 
toupper(iter->type) : tolower(iter->type); - seq_printf(m, "%0*lx %c %s\t[%s]\n", - (int)(2 * sizeof(void *)), - iter->value, type, iter->name, iter->module_name); + seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value, + type, iter->name, iter->module_name); } else - seq_printf(m, "%0*lx %c %s\n", - (int)(2 * sizeof(void *)), - iter->value, iter->type, iter->name); + seq_printf(m, "%pK %c %s\n", (void *)iter->value, + iter->type, iter->name); return 0; } diff --git a/kernel/module.c b/kernel/module.c index efa290ea94b..1f9f7bc56ca 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr, { struct module_sect_attr *sattr = container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%lx\n", sattr->address); + return sprintf(buf, "0x%pK\n", (void *)sattr->address); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading": "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%p", mod->module_core); + seq_printf(m, " 0x%pK", mod->module_core); /* Taints info */ if (mod->taints) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 070d134eef7..ac444ff0165 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -991,7 +991,7 @@ static noinline_for_stack char *pointer(const char *fmt, char *buf, char *end, void *ptr, struct printf_spec spec) { - if (!ptr) { + if (!ptr && *fmt != 'K') { /* * Print (null) with the same width as a pointer so it makes * tabular output look nice. -- cgit v1.2.3-70-g09d2 From 5af5bcb8d37f99ba415a1adc6da71051b84f93a5 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Tue, 22 Mar 2011 16:34:23 -0700 Subject: printk: allow setting DEFAULT_MESSAGE_LEVEL via Kconfig We've been burned by regressions/bugs which we later realized could have been triaged quicker if only we'd paid closer attention to dmesg. To make it easier to audit dmesg, we'd like to make DEFAULT_MESSAGE_LEVEL Kconfig-settable. That way we can set it to KERN_NOTICE and audit any messages <= KERN_WARNING. Signed-off-by: Mandeep Singh Baines Cc: Ingo Molnar Cc: Joe Perches Cc: Olof Johansson Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- lib/Kconfig.debug | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index a53607eea6d..da8ca817eae 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) /* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ +#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL /* We show everything that is MORE important than this.. */ #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index fb0afeff943..bef5faab660 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -9,6 +9,17 @@ config PRINTK_TIME operations. This is useful for identifying long delays in kernel startup. +config DEFAULT_MESSAGE_LOGLEVEL + int "Default message log level (1-7)" + range 1 7 + default "4" + help + Default log level for printk statements with no specified priority. 
+ + This was hard-coded to KERN_WARNING since at least 2.6.10 but folks + that are auditing their logs closely may want to set it to a lower + priority. + config ENABLE_WARN_DEPRECATED bool "Enable __deprecated logic" default y -- cgit v1.2.3-70-g09d2 From 20dd67407160eac577656cd2f8ee9a1fead960b8 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 23 Mar 2011 13:17:23 +0200 Subject: sched: Remove unused 'rq' variable and cpu_rq() call from alloc_fair_sched_group() Signed-off-by: Sergey Senozhatsky Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <20110323111722.GA4244@swordfish.minsk.epam.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 052120d6770..a361e20ec2c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8443,7 +8443,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; struct sched_entity *se; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8456,8 +8455,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) -- cgit v1.2.3-70-g09d2 From dec2960827c85253d76938dbfa909df3be34958b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 23 Mar 2011 14:38:28 +0200 Subject: lockdep: Remove unused 'factor' variable from lockdep_stats_show() Signed-off-by: Sergey Senozhatsky Cc: Peter Zijlstra LKML-Reference: <20110323123828.GB4244@swordfish.minsk.epam.com> Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 1969d2fc4b3..71edd2f60c0 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, - sum_forward_deps = 0, factor = 0; + sum_forward_deps = 0; list_for_each_entry(class, &all_lock_classes, lock_entry) { @@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_hardirq_unsafe * nr_hardirq_safe + nr_list_entries); - /* - * Estimated factor between direct and indirect - * dependencies: - */ - if (nr_list_entries) - factor = sum_forward_deps / nr_list_entries; - #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", nr_lock_chains, MAX_LOCKDEP_CHAINS); -- cgit v1.2.3-70-g09d2 From 1232d6132a986125f6a687ab9b61a4330e319270 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 22 Mar 2011 18:46:18 +0100 Subject: sched, doc: Update sched-design-CFS.txt Correct ->dequeue_tree() thinko into sched_class->dequeue_task and drop all references to ->task_new() since it is obviously gone. 
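For reference, an abridged sketch of the structure the document describes (member subset only, roughly as declared in include/linux/sched.h of this era): the hooks are fields of struct sched_class, which is why the text should name ->dequeue_task() rather than a non-existent ->dequeue_tree(), and why ->task_new() no longer appears at all:

struct sched_class {
	/* ... */
	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
	/* ... */
};
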
Signed-off-by: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <1300815978-16618-1-git-send-email-bp@amd64.org> Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-design-CFS.txt | 7 +------ kernel/sched_idletask.c | 2 -- kernel/sched_stoptask.c | 2 -- 3 files changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index 8239ebbcddc..99961993257 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt @@ -164,7 +164,7 @@ This is the (partial) list of the hooks: It puts the scheduling entity (task) into the red-black tree and increments the nr_running variable. - - dequeue_tree(...) + - dequeue_task(...) When a task is no longer runnable, this function is called to keep the corresponding scheduling entity out of the red-black tree. It decrements @@ -195,11 +195,6 @@ This is the (partial) list of the hooks: This function is mostly called from time tick functions; it might lead to process switch. This drives the running preemption. - - task_new(...) - - The core scheduler gives the scheduling module an opportunity to manage new - task startup. The CFS scheduling module uses it for group scheduling, while - the scheduling module for a real-time task does not use it. diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index c82f26c1b7c..a776a639642 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -94,6 +94,4 @@ static const struct sched_class idle_sched_class = { .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, - - /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 84ec9bcf82d..1ba2bd40fda 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -102,6 +102,4 @@ static const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, - - /* no .task_new for stop tasks */ }; -- cgit v1.2.3-70-g09d2 From 68cacd29167b1926d237bd1b153aa2a990201729 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 23 Mar 2011 16:03:06 +0100 Subject: perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx() This patch solves a stale pointer problem in update_cgrp_time_from_cpuctx(). The cpuctx->cgrp was not cleared on all possible event exit paths, including: close() perf_release() perf_release_kernel() list_del_event() This patch fixes list_del_event() to clear cpuctx->cgrp when there are no cgroup events left in the context. [ This second version makes the code compile when CONFIG_CGROUP_PERF is not enabled. We unconditionally define perf_cpu_context->cgrp. 
] Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: perfmon2-devel@lists.sf.net Cc: paulus@samba.org Cc: davem@davemloft.net LKML-Reference: <20110323150306.GA1580@quad> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 -- kernel/perf_event.c | 12 +++++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f495c014724..311b4dc785a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -938,9 +938,7 @@ struct perf_cpu_context { struct list_head rotation_list; int jiffies_interval; struct pmu *active_pmu; -#ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; -#endif }; struct perf_output_handle { diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3472bb1a070..0c714226ae0 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -941,6 +941,7 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_cpu_context *cpuctx; /* * We can have double detach due to exit/hot-unplug + close. */ @@ -949,8 +950,17 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) + if (is_cgroup_event(event)) { ctx->nr_cgroups--; + cpuctx = __get_cpu_context(ctx); + /* + * if there are no more cgroup events + * then cler cgrp to avoid stale pointer + * in update_cgrp_time_from_cpuctx() + */ + if (!ctx->nr_cgroups) + cpuctx->cgrp = NULL; + } ctx->nr_events--; if (event->attr.inherit_stat) -- cgit v1.2.3-70-g09d2 From 3b9038912828384e38d82409c281124631c8533b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Mar 2011 00:24:11 +0100 Subject: genirq; Remove the last leftovers of the old sparse irq code All users converted. Get rid of it. Signed-off-by: Thomas Gleixner --- include/linux/irqdesc.h | 7 ------- kernel/irq/irqdesc.c | 14 -------------- 2 files changed, 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 00218371518..0b30662bc36 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -100,13 +100,6 @@ struct irq_desc { extern struct irq_desc irq_desc[NR_IRQS]; #endif -/* Will be removed once the last users in power and sh are gone */ -extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); -static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) -{ - return desc; -} - #ifdef CONFIG_GENERIC_HARDIRQS static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index dbccc799407..6fb014f172f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -198,15 +198,6 @@ err: return -ENOMEM; } -struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) -{ - int res = irq_alloc_descs(irq, irq, 1, node); - - if (res == -EEXIST || res == irq) - return irq_to_desc(irq); - return NULL; -} - static int irq_expand_nr_irqs(unsigned int nr) { if (nr > IRQ_BITMAP_BITS) @@ -283,11 +274,6 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? 
irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) -{ - return irq_to_desc(irq); -} - static void free_desc(unsigned int irq) { dynamic_irq_cleanup(irq); -- cgit v1.2.3-70-g09d2 From 880f57318450dbead6a03f9e31a1468924d6dd88 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 23 Mar 2011 19:29:39 +0100 Subject: perf: Better fit max unprivileged mlock pages for tools needs The maximum kilobytes of locked memory that an unprivileged user can reserve is of 512 kB = 128 pages by default, scaled to the number of onlined CPUs, which fits well with the tools that use 128 data pages by default. However tools actually use 129 pages, because they need one more for the user control page. Thus the default mlock threshold is not sufficient for the default tools needs and we always end up to evaluate the constant mlock rlimit policy, which doesn't have this scaling with the number of online CPUs. Hence, on systems that have more than 16 CPUs, we overlap the rlimit threshold and fail to mmap: $ perf record ls Error: failed to mmap with 1 (Operation not permitted) Just increase the max unprivileged mlock threshold by one page so that it supports well perf tools even after 16 CPUs. Reported-by: Han Pingtian Reported-by: Peter Zijlstra Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: Stable LKML-Reference: <1300904979-5508-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 0c714226ae0..c75925c4d1e 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -145,7 +145,8 @@ static struct srcu_struct pmus_srcu; */ int sysctl_perf_event_paranoid __read_mostly = 1; -int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ +/* Minimum for 128 pages + 1 for the user control page */ +int sysctl_perf_event_mlock __read_mostly = 516; /* 'free' kb per user */ /* * max perf event sample rate -- cgit v1.2.3-70-g09d2 From cae5d39032acf26c265f6b1dc73d7ce6ff4bc387 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:17 -0400 Subject: mm: arch: rename in_gate_area_no_task to in_gate_area_no_mm Now that gate vma's are referenced with respect to a particular mm and not a particular task it only makes sense to propagate the change to this predicate as well. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Signed-off-by: Al Viro --- arch/powerpc/kernel/vdso.c | 2 +- arch/s390/kernel/vdso.c | 2 +- arch/sh/kernel/vsyscall/vsyscall.c | 2 +- arch/x86/mm/init_64.c | 8 ++++---- arch/x86/vdso/vdso32-setup.c | 2 +- include/linux/mm.h | 6 +++--- kernel/kallsyms.c | 4 ++-- mm/memory.c | 2 +- mm/nommu.c | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 467aa9ecbf9..142ab1008c3 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -820,7 +820,7 @@ static int __init vdso_init(void) } arch_initcall(vdso_init); -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 9006e966ef0..d73630b4fe1 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -337,7 +337,7 @@ static int __init vdso_init(void) } arch_initcall(vdso_init); -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c index 62c36a8961d..1d6d51a1ce7 100644 --- a/arch/sh/kernel/vsyscall/vsyscall.c +++ b/arch/sh/kernel/vsyscall/vsyscall.c @@ -104,7 +104,7 @@ int in_gate_area(struct mm_struct *mm, unsigned long address) return 0; } -int in_gate_area_no_task(unsigned long address) +int in_gate_area_no_mm(unsigned long address) { return 0; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 43c441622c8..835393c8554 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -881,11 +881,11 @@ int in_gate_area(struct mm_struct *mm, unsigned long addr) } /* - * Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives: + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. 
*/ -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); } diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index f849bb29fda..468d591dde3 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -435,7 +435,7 @@ int in_gate_area(struct mm_struct *mm, unsigned long addr) return vma && addr >= vma->vm_start && addr < vma->vm_end; } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c6d916cd30..9d6efefdde5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1580,11 +1580,11 @@ static inline bool kernel_page_present(struct page *page) { return true; } extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); #ifdef __HAVE_ARCH_GATE_AREA -int in_gate_area_no_task(unsigned long addr); +int in_gate_area_no_mm(unsigned long addr); int in_gate_area(struct mm_struct *mm, unsigned long addr); #else -int in_gate_area_no_task(unsigned long addr); -#define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_task(addr);}) +int in_gate_area_no_mm(unsigned long addr); +#define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);}) #endif /* __HAVE_ARCH_GATE_AREA */ int drop_caches_sysctl_handler(struct ctl_table *, int, diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b575..b9d0fd1d21c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr) if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || arch_is_kernel_text(addr)) return 1; - return in_gate_area_no_task(addr); + return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) return 1; - return in_gate_area_no_task(addr); + return in_gate_area_no_mm(addr); } static int is_ksym_addr(unsigned long addr) diff --git a/mm/memory.c b/mm/memory.c index 931d479b80c..5f5b5de5a40 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3505,7 +3505,7 @@ struct vm_area_struct *get_gate_vma(struct mm_struct *mm) #endif } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { #ifdef AT_SYSINFO_EHDR if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) diff --git a/mm/nommu.c b/mm/nommu.c index f59e1424d3d..e629143f944 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1963,7 +1963,7 @@ error: return -ENOMEM; } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } -- cgit v1.2.3-70-g09d2 From e1a85b2c519551d4792180cdab4074d7e99bf2c9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 23 Mar 2011 22:16:04 +0100 Subject: timekeeping: Use syscore_ops instead of sysdev class and sysdev The timekeeping subsystem uses a sysdev class and a sysdev for executing timekeeping_suspend() after interrupts have been turned off on the boot CPU (during system suspend) and for executing timekeeping_resume() before turning on interrupts on the boot CPU (during system resume). However, since both of these functions ignore their arguments, the entire mechanism may be replaced with a struct syscore_ops object which is simpler. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Thomas Gleixner --- kernel/time/timekeeping.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3bd7e3d5c63..8ad5d576755 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -597,13 +597,12 @@ static struct timespec timekeeping_suspend_time; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev: unused * * This is for the generic clocksource timekeeping. * xtime/wall_to_monotonic/jiffies/etc are * still managed by arch specific suspend/resume code. */ -static int timekeeping_resume(struct sys_device *dev) +static void timekeeping_resume(void) { unsigned long flags; struct timespec ts; @@ -632,11 +631,9 @@ static int timekeeping_resume(struct sys_device *dev) /* Resume hrtimers */ hres_timers_resume(); - - return 0; } -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +static int timekeeping_suspend(void) { unsigned long flags; @@ -654,26 +651,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) } /* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { - .name = "timekeeping", +static struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_resume, .suspend = timekeeping_suspend, }; -static struct sys_device device_timer = { - .id = 0, - .cls = &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) +static int __init timekeeping_init_ops(void) { - int error = sysdev_class_register(&timekeeping_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; + register_syscore_ops(&timekeeping_syscore_ops); + return 0; } -device_initcall(timekeeping_init_device); +device_initcall(timekeeping_init_ops); /* * If the error is already larger, we look ahead even further -- cgit v1.2.3-70-g09d2 From 6c191cd01a935e5b53ef43c9403c771bb7a32b60 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 23 Mar 2011 16:42:18 -0700 Subject: memcg: res_counter_read_u64(): fix potential races on 32-bit machines res_counter_read_u64 reads u64 value without lock. It's dangerous in a 32bit environment. Add locking. 
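For context on why the lock matters: on a 32-bit machine a 64-bit load is issued as two 32-bit loads, so a reader can combine the low half of one update with the high half of another and return a value that never existed. A minimal user-space sketch of the same hazard and fix, with hypothetical names and a pthread spinlock standing in for counter->lock:

#include <pthread.h>
#include <stdint.h>

static pthread_spinlock_t lock;
static uint64_t counter;	/* written by one thread, read by others */

static void counter_init(void)
{
	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
}

/* Unsafe on 32-bit: the load is split into two 32-bit loads, so the
 * halves of 'counter' may come from different updates (a torn read). */
static uint64_t counter_read_torn(void)
{
	return counter;
}

/* Safe: the lock orders the load against 64-bit stores made under the
 * same lock, mirroring the spin_lock_irqsave() added below. */
static uint64_t counter_read_locked(void)
{
	uint64_t v;

	pthread_spin_lock(&lock);
	v = counter;
	pthread_spin_unlock(&lock);
	return v;
}
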
Signed-off-by: KAMEZAWA Hiroyuki Cc: Greg Thelen Cc: Johannes Weiner Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/res_counter.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/res_counter.c b/kernel/res_counter.c index c7eaa37a768..34683efa2cc 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member, pos, buf, s - buf); } +#if BITS_PER_LONG == 32 +u64 res_counter_read_u64(struct res_counter *counter, int member) +{ + unsigned long flags; + u64 ret; + + spin_lock_irqsave(&counter->lock, flags); + ret = *res_counter_member(counter, member); + spin_unlock_irqrestore(&counter->lock, flags); + + return ret; +} +#else u64 res_counter_read_u64(struct res_counter *counter, int member) { return *res_counter_member(counter, member); } +#endif int res_counter_memparse_write_strategy(const char *buf, unsigned long long *res) -- cgit v1.2.3-70-g09d2 From 6b3ae58efca06623c197fd6d91ded4aa3a8fe039 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:30 -0700 Subject: memcg: remove direct page_cgroup-to-page pointer In struct page_cgroup, we have a full word for flags but only a few are reserved. Use the remaining upper bits to encode, depending on configuration, the node or the section, to enable page_cgroup-to-page lookups without a direct pointer. This saves a full word for every page in a system with memory cgroups enabled. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 75 ++++++++++++++++++++++++++++--------- kernel/bounds.c | 2 + mm/memcontrol.c | 4 +- mm/page_cgroup.c | 91 +++++++++++++++++++++++++++------------------ 4 files changed, 117 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 6b63679ce8a..f5de21de31d 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -1,8 +1,26 @@ #ifndef __LINUX_PAGE_CGROUP_H #define __LINUX_PAGE_CGROUP_H +enum { + /* flags for mem_cgroup */ + PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ + PCG_CACHE, /* charged as cache */ + PCG_USED, /* this object is in use. */ + PCG_MIGRATION, /* under page migration */ + /* flags for mem_cgroup and file and I/O status */ + PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ + PCG_FILE_MAPPED, /* page is accounted as "mapped" */ + /* No lock in page_cgroup */ + PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ + __NR_PCG_FLAGS, +}; + +#ifndef __GENERATING_BOUNDS_H +#include + #ifdef CONFIG_CGROUP_MEM_RES_CTLR #include + /* * Page Cgroup can be considered as an extended mem_map. * A page_cgroup page is associated with every page descriptor. The @@ -13,7 +31,6 @@ struct page_cgroup { unsigned long flags; struct mem_cgroup *mem_cgroup; - struct page *page; struct list_head lru; /* per cgroup LRU list */ }; @@ -32,19 +49,7 @@ static inline void __init page_cgroup_init(void) #endif struct page_cgroup *lookup_page_cgroup(struct page *page); - -enum { - /* flags for mem_cgroup */ - PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ - PCG_CACHE, /* charged as cache */ - PCG_USED, /* this object is in use. 
*/ - PCG_MIGRATION, /* under page migration */ - /* flags for mem_cgroup and file and I/O status */ - PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ - PCG_FILE_MAPPED, /* page is accounted as "mapped" */ - /* No lock in page_cgroup */ - PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ -}; +struct page *lookup_cgroup_page(struct page_cgroup *pc); #define TESTPCGFLAG(uname, lname) \ static inline int PageCgroup##uname(struct page_cgroup *pc) \ @@ -117,6 +122,39 @@ static inline void move_unlock_page_cgroup(struct page_cgroup *pc, local_irq_restore(*flags); } +#ifdef CONFIG_SPARSEMEM +#define PCG_ARRAYID_WIDTH SECTIONS_SHIFT +#else +#define PCG_ARRAYID_WIDTH NODES_SHIFT +#endif + +#if (PCG_ARRAYID_WIDTH > BITS_PER_LONG - NR_PCG_FLAGS) +#error Not enough space left in pc->flags to store page_cgroup array IDs +#endif + +/* pc->flags: ARRAY-ID | FLAGS */ + +#define PCG_ARRAYID_MASK ((1UL << PCG_ARRAYID_WIDTH) - 1) + +#define PCG_ARRAYID_OFFSET (BITS_PER_LONG - PCG_ARRAYID_WIDTH) +/* + * Zero the shift count for non-existant fields, to prevent compiler + * warnings and ensure references are optimized away. + */ +#define PCG_ARRAYID_SHIFT (PCG_ARRAYID_OFFSET * (PCG_ARRAYID_WIDTH != 0)) + +static inline void set_page_cgroup_array_id(struct page_cgroup *pc, + unsigned long id) +{ + pc->flags &= ~(PCG_ARRAYID_MASK << PCG_ARRAYID_SHIFT); + pc->flags |= (id & PCG_ARRAYID_MASK) << PCG_ARRAYID_SHIFT; +} + +static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc) +{ + return (pc->flags >> PCG_ARRAYID_SHIFT) & PCG_ARRAYID_MASK; +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; @@ -137,7 +175,7 @@ static inline void __init page_cgroup_init_flatmem(void) { } -#endif +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ #include @@ -173,5 +211,8 @@ static inline void swap_cgroup_swapoff(int type) return; } -#endif -#endif +#endif /* CONFIG_CGROUP_MEM_RES_CTLR_SWAP */ + +#endif /* !__GENERATING_BOUNDS_H */ + +#endif /* __LINUX_PAGE_CGROUP_H */ diff --git a/kernel/bounds.c b/kernel/bounds.c index 98a51f26c13..0c9b862292b 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,11 +9,13 @@ #include #include #include +#include void foo(void) { /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); + DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); /* End of constants */ } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e286e1603e4..660dfc27d97 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1080,7 +1080,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, if (unlikely(!PageCgroupUsed(pc))) continue; - page = pc->page; + page = lookup_cgroup_page(pc); if (unlikely(!PageLRU(page))) continue; @@ -3344,7 +3344,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, } spin_unlock_irqrestore(&zone->lru_lock, flags); - page = pc->page; + page = lookup_cgroup_page(pc); ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); if (ret == -ENOMEM) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 59a3cd4c799..6c3f7a6a481 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -11,12 +11,11 @@ #include #include -static void __meminit -__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) +static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) { pc->flags = 0; + set_page_cgroup_array_id(pc, id); pc->mem_cgroup = NULL; - pc->page = pfn_to_page(pfn); INIT_LIST_HEAD(&pc->lru); } static unsigned long total_usage; @@ -43,6 
+42,19 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return base + offset; } +struct page *lookup_cgroup_page(struct page_cgroup *pc) +{ + unsigned long pfn; + struct page *page; + pg_data_t *pgdat; + + pgdat = NODE_DATA(page_cgroup_array_id(pc)); + pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn; + page = pfn_to_page(pfn); + VM_BUG_ON(pc != lookup_page_cgroup(page)); + return page; +} + static int __init alloc_node_page_cgroup(int nid) { struct page_cgroup *base, *pc; @@ -63,7 +75,7 @@ static int __init alloc_node_page_cgroup(int nid) return -ENOMEM; for (index = 0; index < nr_pages; index++) { pc = base + index; - __init_page_cgroup(pc, start_pfn + index); + init_page_cgroup(pc, nid); } NODE_DATA(nid)->node_page_cgroup = base; total_usage += table_size; @@ -105,46 +117,53 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return section->page_cgroup + pfn; } +struct page *lookup_cgroup_page(struct page_cgroup *pc) +{ + struct mem_section *section; + struct page *page; + unsigned long nr; + + nr = page_cgroup_array_id(pc); + section = __nr_to_section(nr); + page = pfn_to_page(pc - section->page_cgroup); + VM_BUG_ON(pc != lookup_page_cgroup(page)); + return page; +} + /* __alloc_bootmem...() is protected by !slab_available() */ static int __init_refok init_section_page_cgroup(unsigned long pfn) { - struct mem_section *section = __pfn_to_section(pfn); struct page_cgroup *base, *pc; + struct mem_section *section; unsigned long table_size; + unsigned long nr; int nid, index; - if (!section->page_cgroup) { - nid = page_to_nid(pfn_to_page(pfn)); - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - VM_BUG_ON(!slab_is_available()); - if (node_state(nid, N_HIGH_MEMORY)) { - base = kmalloc_node(table_size, - GFP_KERNEL | __GFP_NOWARN, nid); - if (!base) - base = vmalloc_node(table_size, nid); - } else { - base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); - if (!base) - base = vmalloc(table_size); - } - /* - * The value stored in section->page_cgroup is (base - pfn) - * and it does not point to the memory block allocated above, - * causing kmemleak false positives. - */ - kmemleak_not_leak(base); + nr = pfn_to_section_nr(pfn); + section = __nr_to_section(nr); + + if (section->page_cgroup) + return 0; + + nid = page_to_nid(pfn_to_page(pfn)); + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; + VM_BUG_ON(!slab_is_available()); + if (node_state(nid, N_HIGH_MEMORY)) { + base = kmalloc_node(table_size, + GFP_KERNEL | __GFP_NOWARN, nid); + if (!base) + base = vmalloc_node(table_size, nid); } else { - /* - * We don't have to allocate page_cgroup again, but - * address of memmap may be changed. So, we have to initialize - * again. - */ - base = section->page_cgroup + pfn; - table_size = 0; - /* check address of memmap is changed or not. */ - if (base->page == pfn_to_page(pfn)) - return 0; + base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); + if (!base) + base = vmalloc(table_size); } + /* + * The value stored in section->page_cgroup is (base - pfn) + * and it does not point to the memory block allocated above, + * causing kmemleak false positives. 
+ */ + kmemleak_not_leak(base); if (!base) { printk(KERN_ERR "page cgroup allocation failure\n"); @@ -153,7 +172,7 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) for (index = 0; index < PAGES_PER_SECTION; index++) { pc = base + index; - __init_page_cgroup(pc, pfn + index); + init_page_cgroup(pc, nr); } section->page_cgroup = base - pfn; -- cgit v1.2.3-70-g09d2 From 9303e0c4814d2a6afca878cc35433291e862169c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 23 Mar 2011 16:42:45 -0700 Subject: cpuset: remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist() It's not necessary to copy cpuset->mems_allowed to a buffer allocated by NODEMASK_ALLOC(). Just pass it to nodelist_scnprintf(). As spotted by Paul, a side effect is we fix a bug that the function can return -ENOMEM but the caller doesn't expect negative return value. Therefore change the return value of cpuset_sprintf_cpulist() and cpuset_sprintf_memlist() from int to size_t. Signed-off-by: Li Zefan Acked-by: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e92e9818903..4683fe728c9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1610,34 +1610,26 @@ out: * across a page fault. */ -static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) +static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { - int ret; + size_t count; mutex_lock(&callback_mutex); - ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); + count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); mutex_unlock(&callback_mutex); - return ret; + return count; } -static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) +static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) { - NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); - int retval; - - if (mask == NULL) - return -ENOMEM; + size_t count; mutex_lock(&callback_mutex); - *mask = cs->mems_allowed; + count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); mutex_unlock(&callback_mutex); - retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); - - NODEMASK_FREE(mask); - - return retval; + return count; } static ssize_t cpuset_common_file_read(struct cgroup *cont, -- cgit v1.2.3-70-g09d2 From c8163ca8afcac0fc54593fc60d1e1110edbd0eb2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 23 Mar 2011 16:42:46 -0700 Subject: cpuset: remove unneeded NODEMASK_ALLOC() in cpuset_attach() oldcs->mems_allowed is not modified during cpuset_attach(), so we don't have to copy it to a buffer allocated by NODEMASK_ALLOC(). Just pass it to cpuset_migrate_mm(). 
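This is the usual "drop the defensive copy" simplification: the source mask is stable for the duration of the call and the callee only reads it, so the heap-allocated copy buys nothing. A condensed before/after sketch, taken from the diff below with error handling omitted:

/* Before: allocate a nodemask, copy into it, pass the copy, free it. */
NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
*from = oldcs->mems_allowed;
cpuset_migrate_mm(mm, from, to);
NODEMASK_FREE(from);

/* After: oldcs->mems_allowed is not modified while cpuset_attach()
 * runs, so its address can be handed straight to the read-only callee. */
cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
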
Signed-off-by: Li Zefan Cc: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4683fe728c9..7f384f4013b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1438,10 +1438,9 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); - if (from == NULL || to == NULL) + if (to == NULL) goto alloc_fail; if (cs == &top_cpuset) { @@ -1463,18 +1462,16 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, } /* change mm; only needs to be done once even if threadgroup */ - *from = oldcs->mems_allowed; *to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { mpol_rebind_mm(mm, to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, from, to); + cpuset_migrate_mm(mm, &oldcs->mems_allowed, to); mmput(mm); } alloc_fail: - NODEMASK_FREE(from); NODEMASK_FREE(to); } -- cgit v1.2.3-70-g09d2 From ee24d3797780eee6ffe581a7b78d27896f9b494a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 23 Mar 2011 16:42:47 -0700 Subject: cpuset: fix unchecked calls to NODEMASK_ALLOC() Those functions that use NODEMASK_ALLOC() can't propagate errno to users, but will fail silently. Fix it by using a static nodemask_t variable for each function, and those variables are protected by cgroup_mutex; [akpm@linux-foundation.org: fix comment spelling, strengthen cgroup_lock comment] Signed-off-by: Li Zefan Cc: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 51 ++++++++++++++++----------------------------------- 1 file changed, 16 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7f384f4013b..e472fe13919 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); - - if (!newmems) - return; + static nodemask_t newmems; /* protected by cgroup_mutex */ cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, newmems); + guarantee_online_mems(cs, &newmems); - cpuset_change_task_nodemask(p, newmems); - - NODEMASK_FREE(newmems); + cpuset_change_task_nodemask(p, &newmems); mm = get_task_mm(p); if (!mm) @@ -1438,41 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); - - if (to == NULL) - goto alloc_fail; + static nodemask_t to; /* protected by cgroup_mutex */ if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); } else { guarantee_online_cpus(cs, cpus_attach); } - guarantee_online_mems(cs, to); + guarantee_online_mems(cs, &to); /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, to, cs); + cpuset_attach_task(tsk, &to, cs); if (threadgroup) { struct task_struct *c; rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, to, cs); + cpuset_attach_task(c, &to, cs); } rcu_read_unlock(); } /* 
change mm; only needs to be done once even if threadgroup */ - *to = cs->mems_allowed; + to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, to); + mpol_rebind_mm(mm, &to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &oldcs->mems_allowed, to); + cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to); mmput(mm); } - -alloc_fail: - NODEMASK_FREE(to); } /* The various types of files and directories in a cpuset file system */ @@ -2055,10 +2044,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ struct cgroup *cont; - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); - - if (oldmems == NULL) - return; + static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); @@ -2075,7 +2061,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; - *oldmems = cp->mems_allowed; + oldmems = cp->mems_allowed; /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); @@ -2091,10 +2077,9 @@ static void scan_for_empty_cpusets(struct cpuset *root) remove_tasks_in_empty_cpuset(cp); else { update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, oldmems, NULL); + update_tasks_nodemask(cp, &oldmems, NULL); } } - NODEMASK_FREE(oldmems); } /* @@ -2136,19 +2121,16 @@ void cpuset_update_active_cpus(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); - - if (oldmems == NULL) - return NOTIFY_DONE; + static nodemask_t oldmems; /* protected by cgroup_mutex */ cgroup_lock(); switch (action) { case MEM_ONLINE: - *oldmems = top_cpuset.mems_allowed; + oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, oldmems, NULL); + update_tasks_nodemask(&top_cpuset, &oldmems, NULL); break; case MEM_OFFLINE: /* @@ -2162,7 +2144,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self, } cgroup_unlock(); - NODEMASK_FREE(oldmems); return NOTIFY_OK; } #endif -- cgit v1.2.3-70-g09d2 From 523fb486bfd94e3a3b16a42bcb21b1959cf14df8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 23 Mar 2011 16:42:48 -0700 Subject: cpuset: hold callback_mutex in cpuset_post_clone() Chaning cpuset->mems/cpuset->cpus should be protected under callback_mutex. cpuset_clone() doesn't follow this rule. It's ok because it's called when creating and initializing a cgroup, but we'd better hold the lock to avoid subtil break in the future. 
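The rule being enforced is that callback_mutex pairs every writer of a cpuset's cpus/mems masks with the readers that snapshot them. A sketch of that pairing, with the reader side condensed from cpuset_sprintf_memlist() earlier in this series and the writer side matching the diff below:

/* Reader: snapshots mems_allowed under callback_mutex. */
mutex_lock(&callback_mutex);
count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
mutex_unlock(&callback_mutex);

/* Writer (cpuset_post_clone() after this patch): updates both masks
 * under the same mutex, so no reader sees a half-initialized cpuset. */
mutex_lock(&callback_mutex);
cs->mems_allowed = parent_cs->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
mutex_unlock(&callback_mutex);
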
Signed-off-by: Li Zefan Acked-by: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e472fe13919..33eee16addb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1840,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, cs = cgroup_cs(cgroup); parent_cs = cgroup_cs(parent); + mutex_lock(&callback_mutex); cs->mems_allowed = parent_cs->mems_allowed; cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); + mutex_unlock(&callback_mutex); return; } -- cgit v1.2.3-70-g09d2 From 814ecf6e5b7854504ae83255173e53836c5d8420 Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Wed, 23 Mar 2011 16:43:08 -0700 Subject: sysctl_check: drop table->procname checks Since the for loop checks for the table->procname drop useless table->procname checks inside the loop body Signed-off-by: Denis Kirjanov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_check.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 10b90d8a03c..3a01c3e4649 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) const char *fail = NULL; if (table->parent) { - if (table->procname && !table->parent->procname) + if (!table->parent->procname) set_fail(&fail, table, "Parent without procname"); } - if (!table->procname) - set_fail(&fail, table, "No procname"); if (table->child) { if (table->data) set_fail(&fail, table, "Directory with data?"); @@ -144,7 +142,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) set_fail(&fail, table, "No maxlen"); } #ifdef CONFIG_PROC_SYSCTL - if (table->procname && !table->proc_handler) + if (!table->proc_handler) set_fail(&fail, table, "No proc_handler"); #endif #if 0 -- cgit v1.2.3-70-g09d2 From 256c53a65128cbc8a766b1503f3f25a52a8d07cb Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Wed, 23 Mar 2011 16:43:08 -0700 Subject: sysctl_check: drop dead code Drop dead code. Signed-off-by: Denis Kirjanov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_check.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 3a01c3e4649..4e4932a7b36 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -144,10 +144,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) #ifdef CONFIG_PROC_SYSCTL if (!table->proc_handler) set_fail(&fail, table, "No proc_handler"); -#endif -#if 0 - if (!table->procname && table->proc_handler) - set_fail(&fail, table, "proc_handler without procname"); #endif sysctl_check_leaf(namespaces, table, &fail); } -- cgit v1.2.3-70-g09d2 From cb16e95fa2996743a6e80a665ed2ed0590bd38cf Mon Sep 17 00:00:00 2001 From: Petr Holasek Date: Wed, 23 Mar 2011 16:43:09 -0700 Subject: sysctl: add some missing input constraint checks Add boundaries of allowed input ranges for: dirty_expire_centisecs, drop_caches, overcommit_memory, page-cluster and panic_on_oom. 
Signed-off-by: Petr Holasek Acked-by: Dave Young Cc: David Rientjes Cc: Wu Fengguang Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/drop_caches.c | 6 +++++- kernel/sysctl.c | 17 +++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 2195c213ab2..816f88e6b9c 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -45,7 +45,11 @@ static void drop_slab(void) int drop_caches_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_minmax(table, write, buffer, length, ppos); + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret) + return ret; if (write) { if (sysctl_drop_caches & 1) iterate_supers(drop_pagecache_sb, NULL); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 40245d69760..97ab1690f5e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -117,6 +117,7 @@ static int neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static unsigned long one_ul = 1; static int one_hundred = 100; #ifdef CONFIG_PRINTK @@ -971,14 +972,18 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "panic_on_oom", .data = &sysctl_panic_on_oom, .maxlen = sizeof(sysctl_panic_on_oom), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "oom_kill_allocating_task", @@ -1006,7 +1011,8 @@ static struct ctl_table vm_table[] = { .data = &page_cluster, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "dirty_background_ratio", @@ -1054,7 +1060,8 @@ static struct ctl_table vm_table[] = { .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "nr_pdflush_threads", @@ -1130,6 +1137,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = drop_caches_sysctl_handler, + .extra1 = &one, + .extra2 = &three, }, #ifdef CONFIG_COMPACTION { -- cgit v1.2.3-70-g09d2 From bfdc0b497faa82a0ba2f9dddcf109231dd519fcc Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 23 Mar 2011 16:43:11 -0700 Subject: sysctl: restrict write access to dmesg_restrict When dmesg_restrict is set to 1 CAP_SYS_ADMIN is needed to read the kernel ring buffer. But a root user without CAP_SYS_ADMIN is able to reset dmesg_restrict to 0. This is an issue when e.g. LXC (Linux Containers) are used and complete user space is running without CAP_SYS_ADMIN. A unprivileged and jailed root user can bypass the dmesg_restrict protection. With this patch writing to dmesg_restrict is only allowed when root has CAP_SYS_ADMIN. Signed-off-by: Richard Weinberger Acked-by: Dan Rosenberg Acked-by: Serge E. 
Hallyn Cc: Eric Paris Cc: Kees Cook Cc: James Morris Cc: Eugene Teo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97ab1690f5e..c0bb32414b1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -170,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; @@ -707,7 +712,7 @@ static struct ctl_table kern_table[] = { .data = &kptr_restrict, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dmesg_restrict, .extra1 = &zero, .extra2 = &two, }, @@ -2394,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write, return err; } +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + struct do_proc_dointvec_minmax_conv_param { int *min; int *max; -- cgit v1.2.3-70-g09d2 From 45a68628d37222e655219febce9e91b6484789b2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 Mar 2011 16:43:12 -0700 Subject: pid: remove the child_reaper special case in init/main.c This patchset is a cleanup and a preparation to unshare the pid namespace. These prerequisites prepare for Eric's patchset to give a file descriptor to a namespace and join an existing namespace. This patch: It turns out that the existing assignment in copy_process of the child_reaper can handle the initial assignment of child_reaper we just need to generalize the test in kernel/fork.c Signed-off-by: Eric W. Biederman Signed-off-by: Daniel Lezcano Cc: Oleg Nesterov Cc: Alexey Dobriyan Acked-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pid.h | 11 +++++++++++ init/main.c | 9 --------- kernel/fork.c | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/pid.h b/include/linux/pid.h index 49f1c2f66e9..efceda0a51b 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -140,6 +140,17 @@ static inline struct pid_namespace *ns_of_pid(struct pid *pid) return ns; } +/* + * is_child_reaper returns true if the pid is the init process + * of the current namespace. As this one could be checked before + * pid_ns->child_reaper is assigned in copy_process, we check + * with the pid number. + */ +static inline bool is_child_reaper(struct pid *pid) +{ + return pid->numbers[pid->level].nr == 1; +} + /* * the helpers to get the pid's id seen from different namespaces * diff --git a/init/main.c b/init/main.c index 3627bb37225..4a9479ef454 100644 --- a/init/main.c +++ b/init/main.c @@ -787,15 +787,6 @@ static int __init kernel_init(void * unused) * init can run on any cpu. */ set_cpus_allowed_ptr(current, cpu_all_mask); - /* - * Tell the world that we're going to be the grim - * reaper of innocent orphaned children. - * - * We don't want people to have to make incorrect - * assumptions about where in the task array this - * can be found. 
- */ - init_pid_ns.child_reaper = current; cad_pid = task_pid(current); diff --git a/kernel/fork.c b/kernel/fork.c index f2b494d7c55..17aed4378ed 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1296,7 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { - if (clone_flags & CLONE_NEWPID) + if (is_child_reaper(pid)) p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; -- cgit v1.2.3-70-g09d2 From 4308eebbeb2026827d4492ce8c23d99f7f144a82 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 Mar 2011 16:43:13 -0700 Subject: pidns: call pid_ns_prepare_proc() from create_pid_namespace() Reorganize proc_get_sb() so it can be called before the struct pid of the first process is allocated. Signed-off-by: Eric W. Biederman Signed-off-by: Daniel Lezcano Cc: Oleg Nesterov Cc: Alexey Dobriyan Acked-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/root.c | 25 +++++++------------------ kernel/fork.c | 6 ------ kernel/pid_namespace.c | 11 +++++++++-- 3 files changed, 16 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/fs/proc/root.c b/fs/proc/root.c index ef9fa8e24ad..e5e2bfa7a03 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -43,17 +43,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, struct pid_namespace *ns; struct proc_inode *ei; - if (proc_mnt) { - /* Seed the root directory with a pid so it doesn't need - * to be special in base.c. I would do this earlier but - * the only task alive when /proc is mounted the first time - * is the init_task and it doesn't have any pids. - */ - ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode); - if (!ei->pid) - ei->pid = find_get_pid(1); - } - if (flags & MS_KERNMOUNT) ns = (struct pid_namespace *)data; else @@ -71,16 +60,16 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, return ERR_PTR(err); } - ei = PROC_I(sb->s_root->d_inode); - if (!ei->pid) { - rcu_read_lock(); - ei->pid = get_pid(find_pid_ns(1, ns)); - rcu_read_unlock(); - } - sb->s_flags |= MS_ACTIVE; } + ei = PROC_I(sb->s_root->d_inode); + if (!ei->pid) { + rcu_read_lock(); + ei->pid = get_pid(find_pid_ns(1, ns)); + rcu_read_unlock(); + } + return dget(sb->s_root); } diff --git a/kernel/fork.c b/kernel/fork.c index 17aed4378ed..457fff2e17e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1187,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; - - if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); - if (retval < 0) - goto bad_fork_free_pid; - } } p->pid = pid_nr(pid); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a5aff94e1f0..e9c9adc84ca 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -14,6 +14,7 @@ #include #include #include +#include #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; - int i; + int i, err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) @@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); + err = pid_ns_prepare_proc(ns); + if (err) + goto out_put_parent_pid_ns; + 
return ns; +out_put_parent_pid_ns: + put_pid_ns(parent_pid_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: kmem_cache_free(pid_ns_cachep, ns); out: - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); } static void destroy_pid_namespace(struct pid_namespace *ns) -- cgit v1.2.3-70-g09d2 From 59607db367c57f515183cb203642291bb14d9c40 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:16 -0700 Subject: userns: add a user_namespace as creator/owner of uts_namespace The expected course of development for user namespaces targeted capabilities is laid out at https://wiki.ubuntu.com/UserNamespace. Goals: - Make it safe for an unprivileged user to unshare namespaces. They will be privileged with respect to the new namespace, but this should only include resources which the unprivileged user already owns. - Provide separate limits and accounting for userids in different namespaces. Status: Currently (as of 2.6.38) you can clone with the CLONE_NEWUSER flag to get a new user namespace if you have the CAP_SYS_ADMIN, CAP_SETUID, and CAP_SETGID capabilities. What this gets you is a whole new set of userids, meaning that user 500 will have a different 'struct user' in your namespace than in other namespaces. So any accounting information stored in struct user will be unique to your namespace. However, throughout the kernel there are checks which - simply check for a capability. Since root in a child namespace has all capabilities, this means that a child namespace is not constrained. - simply compare uid1 == uid2. Since these are the integer uids, uid 500 in namespace 1 will be said to be equal to uid 500 in namespace 2. As a result, the lxc implementation at lxc.sf.net does not use user namespaces. This is actually helpful because it leaves us free to develop user namespaces in such a way that, for some time, user namespaces may be unuseful. Bugs aside, this patchset is supposed to not at all affect systems which are not actively using user namespaces, and only restrict what tasks in child user namespace can do. They begin to limit privilege to a user namespace, so that root in a container cannot kill or ptrace tasks in the parent user namespace, and can only get world access rights to files. Since all files currently belong to the initila user namespace, that means that child user namespaces can only get world access rights to *all* files. While this temporarily makes user namespaces bad for system containers, it starts to get useful for some sandboxing. I've run the 'runltplite.sh' with and without this patchset and found no difference. This patch: copy_process() handles CLONE_NEWUSER before the rest of the namespaces. So in the case of clone(CLONE_NEWUSER|CLONE_NEWUTS) the new uts namespace will have the new user namespace as its owner. That is what we want, since we want root in that new userns to be able to have privilege over it. Changelog: Feb 15: don't set uts_ns->user_ns if we didn't create a new uts_ns. Feb 23: Move extern init_user_ns declaration from init/version.c to utsname.h. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. 
Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/utsname.h | 4 ++++ init/version.c | 1 + kernel/nsproxy.c | 5 +++++ kernel/user.c | 8 ++++++-- kernel/utsname.c | 4 ++++ 5 files changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 69f39974c04..2c3c0f54370 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -37,9 +37,13 @@ struct new_utsname { #include #include +struct user_namespace; +extern struct user_namespace init_user_ns; + struct uts_namespace { struct kref kref; struct new_utsname name; + struct user_namespace *user_ns; }; extern struct uts_namespace init_uts_ns; diff --git a/init/version.c b/init/version.c index adff586401a..86fe0ccb997 100644 --- a/init/version.c +++ b/init/version.c @@ -33,6 +33,7 @@ struct uts_namespace init_uts_ns = { .machine = UTS_MACHINE, .domainname = UTS_DOMAINNAME, }, + .user_ns = &init_user_ns, }; EXPORT_SYMBOL_GPL(init_uts_ns); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f74e6c00e26..034dc2ed13a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -74,6 +74,11 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } + if (new_nsp->uts_ns != tsk->nsproxy->uts_ns) { + put_user_ns(new_nsp->uts_ns->user_ns); + new_nsp->uts_ns->user_ns = task_cred_xxx(tsk, user)->user_ns; + get_user_ns(new_nsp->uts_ns->user_ns); + } new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); if (IS_ERR(new_nsp->ipc_ns)) { diff --git a/kernel/user.c b/kernel/user.c index 5c598ca781d..9e03e9c1df8 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -17,9 +17,13 @@ #include #include +/* + * userns count is 1 for root user, 1 for init_uts_ns, + * and 1 for... ? + */ struct user_namespace init_user_ns = { .kref = { - .refcount = ATOMIC_INIT(2), + .refcount = ATOMIC_INIT(3), }, .creator = &root_user, }; @@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep; */ static DEFINE_SPINLOCK(uidhash_lock); -/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ +/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ struct user_struct root_user = { .__count = ATOMIC_INIT(2), .processes = ATOMIC_INIT(1), diff --git a/kernel/utsname.c b/kernel/utsname.c index 8a82b4b8ea5..a7b3a8d1ad2 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -14,6 +14,7 @@ #include #include #include +#include static struct uts_namespace *create_uts_ns(void) { @@ -40,6 +41,8 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + ns->user_ns = old_ns->user_ns; + get_user_ns(ns->user_ns); up_read(&uts_sem); return ns; } @@ -71,5 +74,6 @@ void free_uts_ns(struct kref *kref) struct uts_namespace *ns; ns = container_of(kref, struct uts_namespace, kref); + put_user_ns(ns->user_ns); kfree(ns); } -- cgit v1.2.3-70-g09d2 From 3486740a4f32a6a466f5ac931654d154790ba648 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:17 -0700 Subject: userns: security: make capabilities relative to the user namespace - Introduce ns_capable to test for a capability in a non-default user namespace. - Teach cap_capable to handle capabilities in a non-default user namespace. The motivation is to get to the unprivileged creation of new namespaces. 
It looks like this gets us 90% of the way there, with only potential uid confusion issues left. I still need to handle getting all caps after creation but otherwise I think I have a good starter patch that achieves all of your goals. Changelog: 11/05/2010: [serge] add apparmor 12/14/2010: [serge] fix capabilities to created user namespaces Without this, if user serge creates a user_ns, he won't have capabilities to the user_ns he created. THis is because we were first checking whether his effective caps had the caps he needed and returning -EPERM if not, and THEN checking whether he was the creator. Reverse those checks. 12/16/2010: [serge] security_real_capable needs ns argument in !security case 01/11/2011: [serge] add task_ns_capable helper 01/11/2011: [serge] add nsown_capable() helper per Bastian Blank suggestion 02/16/2011: [serge] fix a logic bug: the root user is always creator of init_user_ns, but should not always have capabilities to it! Fix the check in cap_capable(). 02/21/2011: Add the required user_ns parameter to security_capable, fixing a compile failure. 02/23/2011: Convert some macros to functions as per akpm comments. Some couldn't be converted because we can't easily forward-declare them (they are inline if !SECURITY, extern if SECURITY). Add a current_user_ns function so we can use it in capability.h without #including cred.h. Move all forward declarations together to the top of the #ifdef __KERNEL__ section, and use kernel-doc format. 02/23/2011: Per dhowells, clean up comment in cap_capable(). 02/23/2011: Per akpm, remove unreachable 'return -EPERM' in cap_capable. (Original written and signed off by Eric; latest, modified version acked by him) [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: export current_user_ns() for ecryptfs] [serge.hallyn@canonical.com: remove unneeded extra argument in selinux's task_has_capability] Signed-off-by: Eric W. Biederman Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Serge E. 
Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pci/pci-sysfs.c | 2 +- include/linux/capability.h | 36 ++++++++++++++++++++++++++++-------- include/linux/cred.h | 4 +++- include/linux/security.h | 28 +++++++++++++++++----------- kernel/capability.c | 42 +++++++++++++++++++++++++++++++++++++----- kernel/cred.c | 6 ++++++ security/apparmor/lsm.c | 5 +++-- security/commoncap.c | 38 +++++++++++++++++++++++++++++++------- security/security.c | 16 ++++++++++------ security/selinux/hooks.c | 13 ++++++++----- 10 files changed, 144 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index c85438a367d..a8a277a2e0d 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -369,7 +369,7 @@ pci_read_config(struct file *filp, struct kobject *kobj, u8 *data = (u8*) buf; /* Several chips lock up trying to read undefined config space */ - if (security_capable(filp->f_cred, CAP_SYS_ADMIN) == 0) { + if (security_capable(&init_user_ns, filp->f_cred, CAP_SYS_ADMIN) == 0) { size = dev->cfg_size; } else if (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) { size = 128; diff --git a/include/linux/capability.h b/include/linux/capability.h index fb16a3699b9..7c9c8290301 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -368,6 +368,17 @@ struct cpu_vfs_cap_data { #ifdef __KERNEL__ +struct dentry; +struct user_namespace; + +extern struct user_namespace init_user_ns; + +struct user_namespace *current_user_ns(void); + +extern const kernel_cap_t __cap_empty_set; +extern const kernel_cap_t __cap_full_set; +extern const kernel_cap_t __cap_init_eff_set; + /* * Internal kernel functions only */ @@ -530,10 +541,6 @@ static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, cap_intersect(permitted, __cap_nfsd_set)); } -extern const kernel_cap_t __cap_empty_set; -extern const kernel_cap_t __cap_full_set; -extern const kernel_cap_t __cap_init_eff_set; - /** * has_capability - Determine if a task has a superior capability available * @t: The task in question @@ -544,7 +551,7 @@ extern const kernel_cap_t __cap_init_eff_set; * * Note that this does not set PF_SUPERPRIV on the task. */ -#define has_capability(t, cap) (security_real_capable((t), (cap)) == 0) +#define has_capability(t, cap) (security_real_capable((t), &init_user_ns, (cap)) == 0) /** * has_capability_noaudit - Determine if a task has a superior capability available (unaudited) @@ -558,12 +565,25 @@ extern const kernel_cap_t __cap_init_eff_set; * Note that this does not set PF_SUPERPRIV on the task. */ #define has_capability_noaudit(t, cap) \ - (security_real_capable_noaudit((t), (cap)) == 0) + (security_real_capable_noaudit((t), &init_user_ns, (cap)) == 0) -extern int capable(int cap); +extern bool capable(int cap); +extern bool ns_capable(struct user_namespace *ns, int cap); +extern bool task_ns_capable(struct task_struct *t, int cap); + +/** + * nsown_capable - Check superior capability to one's own user_ns + * @cap: The capability in question + * + * Return true if the current task has the given superior capability + * targeted at its own user namespace. 
+ */ +static inline bool nsown_capable(int cap) +{ + return ns_capable(current_user_ns(), cap); +} /* audit system wants to get cap info from files as well */ -struct dentry; extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); #endif /* __KERNEL__ */ diff --git a/include/linux/cred.h b/include/linux/cred.h index 4aaeab37644..9aeeb0ba200 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -354,9 +354,11 @@ static inline void put_cred(const struct cred *_cred) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) -#define current_user_ns() (current_cred_xxx(user)->user_ns) +#define _current_user_ns() (current_cred_xxx(user)->user_ns) #define current_security() (current_cred_xxx(security)) +extern struct user_namespace *current_user_ns(void); + #define current_uid_gid(_uid, _gid) \ do { \ const struct cred *__cred; \ diff --git a/include/linux/security.h b/include/linux/security.h index 56cac520d01..ca02f171673 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -47,13 +47,14 @@ struct ctl_table; struct audit_krule; +struct user_namespace; /* * These functions are in security/capability.c and are used * as the default capabilities functions */ extern int cap_capable(struct task_struct *tsk, const struct cred *cred, - int cap, int audit); + struct user_namespace *ns, int cap, int audit); extern int cap_settime(const struct timespec *ts, const struct timezone *tz); extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); @@ -1262,6 +1263,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * credentials. * @tsk contains the task_struct for the process. * @cred contains the credentials to use. + * @ns contains the user namespace we want the capability in * @cap contains the capability . * @audit: Whether to write an audit message or not * Return 0 if the capability is granted for @tsk. 
@@ -1384,7 +1386,7 @@ struct security_operations { const kernel_cap_t *inheritable, const kernel_cap_t *permitted); int (*capable) (struct task_struct *tsk, const struct cred *cred, - int cap, int audit); + struct user_namespace *ns, int cap, int audit); int (*quotactl) (int cmds, int type, int id, struct super_block *sb); int (*quota_on) (struct dentry *dentry); int (*syslog) (int type); @@ -1665,9 +1667,12 @@ int security_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted); -int security_capable(const struct cred *cred, int cap); -int security_real_capable(struct task_struct *tsk, int cap); -int security_real_capable_noaudit(struct task_struct *tsk, int cap); +int security_capable(struct user_namespace *ns, const struct cred *cred, + int cap); +int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, + int cap); +int security_real_capable_noaudit(struct task_struct *tsk, + struct user_namespace *ns, int cap); int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); int security_syslog(int type); @@ -1860,28 +1865,29 @@ static inline int security_capset(struct cred *new, return cap_capset(new, old, effective, inheritable, permitted); } -static inline int security_capable(const struct cred *cred, int cap) +static inline int security_capable(struct user_namespace *ns, + const struct cred *cred, int cap) { - return cap_capable(current, cred, cap, SECURITY_CAP_AUDIT); + return cap_capable(current, cred, ns, cap, SECURITY_CAP_AUDIT); } -static inline int security_real_capable(struct task_struct *tsk, int cap) +static inline int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); - ret = cap_capable(tsk, __task_cred(tsk), cap, SECURITY_CAP_AUDIT); + ret = cap_capable(tsk, __task_cred(tsk), ns, cap, SECURITY_CAP_AUDIT); rcu_read_unlock(); return ret; } static inline -int security_real_capable_noaudit(struct task_struct *tsk, int cap) +int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); - ret = cap_capable(tsk, __task_cred(tsk), cap, + ret = cap_capable(tsk, __task_cred(tsk), ns, cap, SECURITY_CAP_NOAUDIT); rcu_read_unlock(); return ret; diff --git a/kernel/capability.c b/kernel/capability.c index 9e9385f132c..0a3d2c863a1 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -14,6 +14,7 @@ #include #include #include +#include #include /* @@ -299,17 +300,48 @@ error: * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ -int capable(int cap) +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); + +/** + * ns_capable - Determine if the current task has a superior capability in effect + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. 
+ */ +bool ns_capable(struct user_namespace *ns, int cap) { if (unlikely(!cap_valid(cap))) { printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); BUG(); } - if (security_capable(current_cred(), cap) == 0) { + if (security_capable(ns, current_cred(), cap) == 0) { current->flags |= PF_SUPERPRIV; - return 1; + return true; } - return 0; + return false; } -EXPORT_SYMBOL(capable); +EXPORT_SYMBOL(ns_capable); + +/** + * task_ns_capable - Determine whether current task has a superior + * capability targeted at a specific task's user namespace. + * @t: The task whose user namespace is targeted. + * @cap: The capability in question. + * + * Return true if it does, false otherwise. + */ +bool task_ns_capable(struct task_struct *t, int cap) +{ + return ns_capable(task_cred_xxx(t, user)->user_ns, cap); +} +EXPORT_SYMBOL(task_ns_capable); diff --git a/kernel/cred.c b/kernel/cred.c index 2343c132c5a..5557b55048d 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode) } EXPORT_SYMBOL(set_create_files_as); +struct user_namespace *current_user_ns(void) +{ + return _current_user_ns(); +} +EXPORT_SYMBOL(current_user_ns); + #ifdef CONFIG_DEBUG_CREDENTIALS bool creds_are_invalid(const struct cred *cred) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index d21a427a35a..ae3a698415e 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "include/apparmor.h" @@ -136,11 +137,11 @@ static int apparmor_capget(struct task_struct *target, kernel_cap_t *effective, } static int apparmor_capable(struct task_struct *task, const struct cred *cred, - int cap, int audit) + struct user_namespace *ns, int cap, int audit) { struct aa_profile *profile; /* cap_capable returns 0 on success, else -EPERM */ - int error = cap_capable(task, cred, cap, audit); + int error = cap_capable(task, cred, ns, cap, audit); if (!error) { profile = aa_cred_profile(cred); if (!unconfined(profile)) diff --git a/security/commoncap.c b/security/commoncap.c index 49c57fd60ae..43a205bc7d7 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * If a non-root user executes a setuid-root binary in @@ -67,6 +68,7 @@ EXPORT_SYMBOL(cap_netlink_recv); * cap_capable - Determine whether a task has a particular effective capability * @tsk: The task to query * @cred: The credentials to use + * @ns: The user namespace in which we need the capability * @cap: The capability to check for * @audit: Whether to write an audit message or not * @@ -78,10 +80,30 @@ EXPORT_SYMBOL(cap_netlink_recv); * cap_has_capability() returns 0 when a task has a capability, but the * kernel's capable() and has_capability() returns 1 for this case. */ -int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap, - int audit) +int cap_capable(struct task_struct *tsk, const struct cred *cred, + struct user_namespace *targ_ns, int cap, int audit) { - return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; + for (;;) { + /* The creator of the user namespace has all caps. */ + if (targ_ns != &init_user_ns && targ_ns->creator == cred->user) + return 0; + + /* Do we have the necessary capabilities? */ + if (targ_ns == cred->user->user_ns) + return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; + + /* Have we tried all of the parent namespaces? 
*/ + if (targ_ns == &init_user_ns) + return -EPERM; + + /* + *If you have a capability in a parent user ns, then you have + * it over all children user namespaces as well. + */ + targ_ns = targ_ns->creator->user_ns; + } + + /* We never get here */ } /** @@ -176,7 +198,8 @@ static inline int cap_inh_is_capped(void) /* they are so limited unless the current task has the CAP_SETPCAP * capability */ - if (cap_capable(current, current_cred(), CAP_SETPCAP, + if (cap_capable(current, current_cred(), + current_cred()->user->user_ns, CAP_SETPCAP, SECURITY_CAP_AUDIT) == 0) return 0; return 1; @@ -828,7 +851,8 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, & (new->securebits ^ arg2)) /*[1]*/ || ((new->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ - || (cap_capable(current, current_cred(), CAP_SETPCAP, + || (cap_capable(current, current_cred(), + current_cred()->user->user_ns, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0) /*[4]*/ /* * [1] no changing of bits that are locked @@ -893,7 +917,7 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages) { int cap_sys_admin = 0; - if (cap_capable(current, current_cred(), CAP_SYS_ADMIN, + if (cap_capable(current, current_cred(), &init_user_ns, CAP_SYS_ADMIN, SECURITY_CAP_NOAUDIT) == 0) cap_sys_admin = 1; return __vm_enough_memory(mm, pages, cap_sys_admin); @@ -920,7 +944,7 @@ int cap_file_mmap(struct file *file, unsigned long reqprot, int ret = 0; if (addr < dac_mmap_min_addr) { - ret = cap_capable(current, current_cred(), CAP_SYS_RAWIO, + ret = cap_capable(current, current_cred(), &init_user_ns, CAP_SYS_RAWIO, SECURITY_CAP_AUDIT); /* set PF_SUPERPRIV if it turns out we allow the low mmap */ if (ret == 0) diff --git a/security/security.c b/security/security.c index 9187665a3fd..101142369db 100644 --- a/security/security.c +++ b/security/security.c @@ -154,29 +154,33 @@ int security_capset(struct cred *new, const struct cred *old, effective, inheritable, permitted); } -int security_capable(const struct cred *cred, int cap) +int security_capable(struct user_namespace *ns, const struct cred *cred, + int cap) { - return security_ops->capable(current, cred, cap, SECURITY_CAP_AUDIT); + return security_ops->capable(current, cred, ns, cap, + SECURITY_CAP_AUDIT); } -int security_real_capable(struct task_struct *tsk, int cap) +int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, + int cap) { const struct cred *cred; int ret; cred = get_task_cred(tsk); - ret = security_ops->capable(tsk, cred, cap, SECURITY_CAP_AUDIT); + ret = security_ops->capable(tsk, cred, ns, cap, SECURITY_CAP_AUDIT); put_cred(cred); return ret; } -int security_real_capable_noaudit(struct task_struct *tsk, int cap) +int security_real_capable_noaudit(struct task_struct *tsk, + struct user_namespace *ns, int cap) { const struct cred *cred; int ret; cred = get_task_cred(tsk); - ret = security_ops->capable(tsk, cred, cap, SECURITY_CAP_NOAUDIT); + ret = security_ops->capable(tsk, cred, ns, cap, SECURITY_CAP_NOAUDIT); put_cred(cred); return ret; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 6475e1f0223..c67f863d354 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -79,6 +79,7 @@ #include #include #include +#include #include "avc.h" #include "objsec.h" @@ -1846,11 +1847,11 @@ static int selinux_capset(struct cred *new, const struct cred *old, */ static int selinux_capable(struct task_struct *tsk, const struct cred *cred, - int cap, int audit) + struct 
user_namespace *ns, int cap, int audit) { int rc; - rc = cap_capable(tsk, cred, cap, audit); + rc = cap_capable(tsk, cred, ns, cap, audit); if (rc) return rc; @@ -1931,7 +1932,8 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages) { int rc, cap_sys_admin = 0; - rc = selinux_capable(current, current_cred(), CAP_SYS_ADMIN, + rc = selinux_capable(current, current_cred(), + &init_user_ns, CAP_SYS_ADMIN, SECURITY_CAP_NOAUDIT); if (rc == 0) cap_sys_admin = 1; @@ -2834,7 +2836,8 @@ static int selinux_inode_getsecurity(const struct inode *inode, const char *name * and lack of permission just means that we fall back to the * in-core context value, not a denial. */ - error = selinux_capable(current, current_cred(), CAP_MAC_ADMIN, + error = selinux_capable(current, current_cred(), + &init_user_ns, CAP_MAC_ADMIN, SECURITY_CAP_NOAUDIT); if (!error) error = security_sid_to_context_force(isec->sid, &context, @@ -2968,7 +2971,7 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd, case KDSKBENT: case KDSKBSENT: error = task_has_capability(current, cred, CAP_SYS_TTY_CONFIG, - SECURITY_CAP_AUDIT); + SECURITY_CAP_AUDIT); break; /* default case assumes that the command will go -- cgit v1.2.3-70-g09d2 From bb96a6f50be27390dc959ff67d9ea0ea0cfbe177 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:18 -0700 Subject: userns: allow sethostname in a container Changelog: Feb 23: let clone_uts_ns() handle setting uts->user_ns To do so we need to pass in the task_struct who'll get the utsname, so we can get its user_ns. Feb 23: As per Oleg's coment, just pass in tsk, instead of two of its members. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/utsname.h | 6 +++--- kernel/nsproxy.c | 7 +------ kernel/sys.c | 2 +- kernel/utsname.c | 12 +++++++----- 4 files changed, 12 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 2c3c0f54370..4e5b0213fdc 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -54,7 +54,7 @@ static inline void get_uts_ns(struct uts_namespace *ns) } extern struct uts_namespace *copy_utsname(unsigned long flags, - struct uts_namespace *ns); + struct task_struct *tsk); extern void free_uts_ns(struct kref *kref); static inline void put_uts_ns(struct uts_namespace *ns) @@ -71,12 +71,12 @@ static inline void put_uts_ns(struct uts_namespace *ns) } static inline struct uts_namespace *copy_utsname(unsigned long flags, - struct uts_namespace *ns) + struct task_struct *tsk) { if (flags & CLONE_NEWUTS) return ERR_PTR(-EINVAL); - return ns; + return tsk->nsproxy->uts_ns; } #endif diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 034dc2ed13a..b97fc9d04dd 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -69,16 +69,11 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ns; } - new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); + new_nsp->uts_ns = copy_utsname(flags, tsk); if (IS_ERR(new_nsp->uts_ns)) { err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } - if (new_nsp->uts_ns != tsk->nsproxy->uts_ns) { - put_user_ns(new_nsp->uts_ns->user_ns); - new_nsp->uts_ns->user_ns = task_cred_xxx(tsk, user)->user_ns; - get_user_ns(new_nsp->uts_ns->user_ns); - } new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); if (IS_ERR(new_nsp->ipc_ns)) { diff --git a/kernel/sys.c 
b/kernel/sys.c index 1ad48b3b906..5761c53e19e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1181,7 +1181,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; diff --git a/kernel/utsname.c b/kernel/utsname.c index a7b3a8d1ad2..44646179eab 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -31,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void) * @old_ns: namespace to clone * Return NULL on error (failure to kmalloc), new ns otherwise */ -static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) +static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, + struct uts_namespace *old_ns) { struct uts_namespace *ns; @@ -41,8 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - ns->user_ns = old_ns->user_ns; - get_user_ns(ns->user_ns); + ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); up_read(&uts_sem); return ns; } @@ -53,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) +struct uts_namespace *copy_utsname(unsigned long flags, + struct task_struct *tsk) { + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; BUG_ON(!old_ns); @@ -63,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol if (!(flags & CLONE_NEWUTS)) return old_ns; - new_ns = clone_uts_ns(old_ns); + new_ns = clone_uts_ns(tsk, old_ns); put_uts_ns(old_ns); return new_ns; -- cgit v1.2.3-70-g09d2 From 39fd33933b0209e4b6254743f2cede07c5ad4c52 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:19 -0700 Subject: userns: allow killing tasks in your own or child userns Changelog: Dec 8: Fixed bug in my check_kill_permission pointed out by Eric Biederman. Dec 13: Apply Eric's suggestion to pass target task into kill_ok_by_cred() for clarity Dec 31: address comment by Eric Biederman: don't need cred/tcred in check_kill_permission. Jan 1: use const cred struct. Jan 11: Per Bastian Blank's advice, clean up kill_ok_by_cred(). Feb 16: kill_ok_by_cred: fix bad parentheses Feb 23: per akpm, let compiler inline kill_ok_by_cred Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. 
Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 31751868de8..324eff5468a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -635,6 +635,27 @@ static inline bool si_fromuser(const struct siginfo *info) (!is_si_special(info) && SI_FROMUSER(info)); } +/* + * called with RCU read lock from check_kill_permission() + */ +static int kill_ok_by_cred(struct task_struct *t) +{ + const struct cred *cred = current_cred(); + const struct cred *tcred = __task_cred(t); + + if (cred->user->user_ns == tcred->user->user_ns && + (cred->euid == tcred->suid || + cred->euid == tcred->uid || + cred->uid == tcred->suid || + cred->uid == tcred->uid)) + return 1; + + if (ns_capable(tcred->user->user_ns, CAP_KILL)) + return 1; + + return 0; +} + /* * Bad permissions for sending the signal * - the caller must hold the RCU read lock @@ -642,7 +663,6 @@ static inline bool si_fromuser(const struct siginfo *info) static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { - const struct cred *cred, *tcred; struct pid *sid; int error; @@ -656,14 +676,8 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; - cred = current_cred(); - tcred = __task_cred(t); if (!same_thread_group(current, t) && - (cred->euid ^ tcred->suid) && - (cred->euid ^ tcred->uid) && - (cred->uid ^ tcred->suid) && - (cred->uid ^ tcred->uid) && - !capable(CAP_KILL)) { + !kill_ok_by_cred(t)) { switch (sig) { case SIGCONT: sid = task_session(t); -- cgit v1.2.3-70-g09d2 From 8409cca7056113bee3236cb6a8e4d8d4d1eef102 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:20 -0700 Subject: userns: allow ptrace from non-init user namespaces ptrace is allowed to tasks in the same user namespace according to the usual rules (i.e. the same rules as for two tasks in the init user namespace). ptrace is also allowed to a user namespace to which the current task has the CAP_SYS_PTRACE capability. Changelog: Dec 31: Address feedback by Eric: . Correct ptrace uid check . Rename may_ptrace_ns to ptrace_capable . Also fix the cap_ptrace checks. Jan 1: Use const cred struct Jan 11: use task_ns_capable() in place of ptrace_capable(). Feb 23: same_or_ancestore_user_ns() was not an appropriate check to constrain cap_issubset. Rather, cap_issubset() is only meaningful when both capsets are in the same user_ns. Signed-off-by: Serge E. Hallyn Cc: "Eric W. 
Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/capability.h | 2 ++ kernel/ptrace.c | 27 +++++++++++++++------------ security/commoncap.c | 40 ++++++++++++++++++++++++++++++++-------- 3 files changed, 49 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 7c9c8290301..2ec4a8cc86a 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -553,6 +553,8 @@ static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, */ #define has_capability(t, cap) (security_real_capable((t), &init_user_ns, (cap)) == 0) +#define has_ns_capability(t, ns, cap) (security_real_capable((t), (ns), (cap)) == 0) + /** * has_capability_noaudit - Determine if a task has a superior capability available (unaudited) * @t: The task in question diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e2302e40b36..0fc1eed28d2 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -134,21 +134,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) return 0; rcu_read_lock(); tcred = __task_cred(task); - if ((cred->uid != tcred->euid || - cred->uid != tcred->suid || - cred->uid != tcred->uid || - cred->gid != tcred->egid || - cred->gid != tcred->sgid || - cred->gid != tcred->gid) && - !capable(CAP_SYS_PTRACE)) { - rcu_read_unlock(); - return -EPERM; - } + if (cred->user->user_ns == tcred->user->user_ns && + (cred->uid == tcred->euid && + cred->uid == tcred->suid && + cred->uid == tcred->uid && + cred->gid == tcred->egid && + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + goto ok; + if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) + goto ok; + rcu_read_unlock(); + return -EPERM; +ok: rcu_read_unlock(); smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !capable(CAP_SYS_PTRACE)) + if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) return -EPERM; return security_ptrace_access_check(task, mode); @@ -198,7 +201,7 @@ static int ptrace_attach(struct task_struct *task) goto unlock_tasklist; task->ptrace = PT_PTRACED; - if (capable(CAP_SYS_PTRACE)) + if (task_ns_capable(task, CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); diff --git a/security/commoncap.c b/security/commoncap.c index 43a205bc7d7..f20e984ccfb 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -127,18 +127,30 @@ int cap_settime(const struct timespec *ts, const struct timezone *tz) * @child: The process to be accessed * @mode: The mode of attachment. * + * If we are in the same or an ancestor user_ns and have all the target + * task's capabilities, then ptrace access is allowed. + * If we have the ptrace capability to the target user_ns, then ptrace + * access is allowed. + * Else denied. + * * Determine whether a process may access another, returning 0 if permission * granted, -ve if denied. 
*/ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; + const struct cred *cred, *child_cred; rcu_read_lock(); - if (!cap_issubset(__task_cred(child)->cap_permitted, - current_cred()->cap_permitted) && - !capable(CAP_SYS_PTRACE)) - ret = -EPERM; + cred = current_cred(); + child_cred = __task_cred(child); + if (cred->user->user_ns == child_cred->user->user_ns && + cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) + goto out; + if (ns_capable(child_cred->user->user_ns, CAP_SYS_PTRACE)) + goto out; + ret = -EPERM; +out: rcu_read_unlock(); return ret; } @@ -147,18 +159,30 @@ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) * cap_ptrace_traceme - Determine whether another process may trace the current * @parent: The task proposed to be the tracer * + * If parent is in the same or an ancestor user_ns and has all current's + * capabilities, then ptrace access is allowed. + * If parent has the ptrace capability to current's user_ns, then ptrace + * access is allowed. + * Else denied. + * * Determine whether the nominated task is permitted to trace the current * process, returning 0 if permission is granted, -ve if denied. */ int cap_ptrace_traceme(struct task_struct *parent) { int ret = 0; + const struct cred *cred, *child_cred; rcu_read_lock(); - if (!cap_issubset(current_cred()->cap_permitted, - __task_cred(parent)->cap_permitted) && - !has_capability(parent, CAP_SYS_PTRACE)) - ret = -EPERM; + cred = __task_cred(parent); + child_cred = current_cred(); + if (cred->user->user_ns == child_cred->user->user_ns && + cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) + goto out; + if (has_ns_capability(parent, child_cred->user->user_ns, CAP_SYS_PTRACE)) + goto out; + ret = -EPERM; +out: rcu_read_unlock(); return ret; } -- cgit v1.2.3-70-g09d2 From 3263245de48344ad7bdd0e7256bf1606d2592f88 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:21 -0700 Subject: userns: make has_capability* into real functions So we can let type safety keep things sane, and as a bonus we can remove the declaration of init_user_ns in capability.h. Signed-off-by: Serge E. Hallyn Cc: "Eric W. Biederman" Cc: Daniel Lezcano Cc: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/capability.h | 34 ++++------------------------- kernel/capability.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 2ec4a8cc86a..16ee8b49a20 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -371,8 +371,6 @@ struct cpu_vfs_cap_data { struct dentry; struct user_namespace; -extern struct user_namespace init_user_ns; - struct user_namespace *current_user_ns(void); extern const kernel_cap_t __cap_empty_set; @@ -541,34 +539,10 @@ static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, cap_intersect(permitted, __cap_nfsd_set)); } -/** - * has_capability - Determine if a task has a superior capability available - * @t: The task in question - * @cap: The capability to be tested for - * - * Return true if the specified task has the given superior capability - * currently in effect, false if not. - * - * Note that this does not set PF_SUPERPRIV on the task. 
- */ -#define has_capability(t, cap) (security_real_capable((t), &init_user_ns, (cap)) == 0) - -#define has_ns_capability(t, ns, cap) (security_real_capable((t), (ns), (cap)) == 0) - -/** - * has_capability_noaudit - Determine if a task has a superior capability available (unaudited) - * @t: The task in question - * @cap: The capability to be tested for - * - * Return true if the specified task has the given superior capability - * currently in effect, false if not, but don't write an audit message for the - * check. - * - * Note that this does not set PF_SUPERPRIV on the task. - */ -#define has_capability_noaudit(t, cap) \ - (security_real_capable_noaudit((t), &init_user_ns, (cap)) == 0) - +extern bool has_capability(struct task_struct *t, int cap); +extern bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap); +extern bool has_capability_noaudit(struct task_struct *t, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool task_ns_capable(struct task_struct *t, int cap); diff --git a/kernel/capability.c b/kernel/capability.c index 0a3d2c863a1..bf0c734d0c1 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -290,6 +290,60 @@ error: return ret; } +/** + * has_capability - Does a task have a capability in init_user_ns + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to the initial user namespace, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability(struct task_struct *t, int cap) +{ + int ret = security_real_capable(t, &init_user_ns, cap); + + return (ret == 0); +} + +/** + * has_capability - Does a task have a capability in a specific user ns + * @t: The task in question + * @ns: target user namespace + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to the specified user namespace, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap) +{ + int ret = security_real_capable(t, ns, cap); + + return (ret == 0); +} + +/** + * has_capability_noaudit - Does a task have a capability (unaudited) + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to init_user_ns, false if not. Don't write an + * audit message for the check. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability_noaudit(struct task_struct *t, int cap) +{ + int ret = security_real_capable_noaudit(t, &init_user_ns, cap); + + return (ret == 0); +} + /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for -- cgit v1.2.3-70-g09d2 From fc832ad3645f0507f24d11752544525a50a83c71 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:22 -0700 Subject: userns: user namespaces: convert all capable checks in kernel/sys.c This allows setuid/setgid in containers. It also fixes some corner cases where kernel logic foregoes capability checks when uids are equivalent. The latter will need to be done throughout the whole kernel. Changelog: Jan 11: Use nsown_capable() as suggested by Bastian Blank. 
Jan 11: Fix logic errors in uid checks pointed out by Bastian. Feb 15: allow prlimit to current (was regression in previous version) Feb 23: remove debugging printks, uninline set_one_prio_perm and make it bool, and document its return value. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 75 +++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 5761c53e19e..af468edf096 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -119,17 +119,34 @@ EXPORT_SYMBOL(cad_pid); void (*pm_power_off_prepare)(void); +/* + * Returns true if current's euid is same as p's uid or euid, + * or has CAP_SYS_NICE to p's user_ns. + * + * Called with rcu_read_lock, creds are safe + */ +static bool set_one_prio_perm(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred = __task_cred(p); + + if (pcred->user->user_ns == cred->user->user_ns && + (pcred->uid == cred->euid || + pcred->euid == cred->euid)) + return true; + if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) + return true; + return false; +} + /* * set the priority of a task * - the caller must hold the RCU read lock */ static int set_one_prio(struct task_struct *p, int niceval, int error) { - const struct cred *cred = current_cred(), *pcred = __task_cred(p); int no_nice; - if (pcred->uid != cred->euid && - pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { + if (!set_one_prio_perm(p)) { error = -EPERM; goto out; } @@ -506,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (rgid != (gid_t) -1) { if (old->gid == rgid || old->egid == rgid || - capable(CAP_SETGID)) + nsown_capable(CAP_SETGID)) new->gid = rgid; else goto error; @@ -515,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (old->gid == egid || old->egid == egid || old->sgid == egid || - capable(CAP_SETGID)) + nsown_capable(CAP_SETGID)) new->egid = egid; else goto error; @@ -550,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) old = current_cred(); retval = -EPERM; - if (capable(CAP_SETGID)) + if (nsown_capable(CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = gid; else if (gid == old->gid || gid == old->sgid) new->egid = new->fsgid = gid; @@ -617,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) new->uid = ruid; if (old->uid != ruid && old->euid != ruid && - !capable(CAP_SETUID)) + !nsown_capable(CAP_SETUID)) goto error; } @@ -626,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) if (old->uid != euid && old->euid != euid && old->suid != euid && - !capable(CAP_SETUID)) + !nsown_capable(CAP_SETUID)) goto error; } @@ -674,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) old = current_cred(); retval = -EPERM; - if (capable(CAP_SETUID)) { + if (nsown_capable(CAP_SETUID)) { new->suid = new->uid = uid; if (uid != old->uid) { retval = set_user(new); @@ -716,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) old = current_cred(); retval = -EPERM; - if (!capable(CAP_SETUID)) { + if (!nsown_capable(CAP_SETUID)) { if (ruid != (uid_t) -1 && ruid != old->uid && ruid != old->euid && ruid != old->suid) goto error; @@ -780,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) old = current_cred(); retval = -EPERM; - if (!capable(CAP_SETGID)) { + if (!nsown_capable(CAP_SETGID)) { if (rgid != (gid_t) 
-1 && rgid != old->gid && rgid != old->egid && rgid != old->sgid) goto error; @@ -840,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) if (uid == old->uid || uid == old->euid || uid == old->suid || uid == old->fsuid || - capable(CAP_SETUID)) { + nsown_capable(CAP_SETUID)) { if (uid != old_fsuid) { new->fsuid = uid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) @@ -873,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) if (gid == old->gid || gid == old->egid || gid == old->sgid || gid == old->fsgid || - capable(CAP_SETGID)) { + nsown_capable(CAP_SETGID)) { if (gid != old_fsgid) { new->fsgid = gid; goto change_okay; @@ -1183,6 +1200,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; down_write(&uts_sem); @@ -1230,7 +1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; @@ -1345,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, rlim = tsk->signal->rlim + resource; task_lock(tsk->group_leader); if (new_rlim) { + /* Keep the capable check against init_user_ns until + cgroups can contain all limits */ if (new_rlim->rlim_max > rlim->rlim_max && !capable(CAP_SYS_RESOURCE)) retval = -EPERM; @@ -1388,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task) { const struct cred *cred = current_cred(), *tcred; - tcred = __task_cred(task); - if (current != task && - (cred->uid != tcred->euid || - cred->uid != tcred->suid || - cred->uid != tcred->uid || - cred->gid != tcred->egid || - cred->gid != tcred->sgid || - cred->gid != tcred->gid) && - !capable(CAP_SYS_RESOURCE)) { - return -EPERM; - } + if (current == task) + return 0; - return 0; + tcred = __task_cred(task); + if (cred->user->user_ns == tcred->user->user_ns && + (cred->uid == tcred->euid && + cred->uid == tcred->suid && + cred->uid == tcred->uid && + cred->gid == tcred->egid && + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + return 0; + if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) + return 0; + + return -EPERM; } SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, -- cgit v1.2.3-70-g09d2 From b515498f5bb5f38fc0e390b4ff7d00b6077de127 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:23 -0700 Subject: userns: add a user namespace owner of ipc ns Changelog: Feb 15: Don't set new ipc->user_ns if we didn't create a new ipc_ns. Feb 23: Move extern declaration to ipc_namespace.h, and group fwd declarations at top. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. 
Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 3 +++ ipc/msgutil.c | 1 + ipc/namespace.c | 9 +++++++-- kernel/nsproxy.c | 5 +++++ 4 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 51952989ad4..d3c32dcec62 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -15,6 +15,7 @@ #define IPCNS_CALLBACK_PRI 0 +struct user_namespace; struct ipc_ids { int in_use; @@ -56,6 +57,8 @@ struct ipc_namespace { unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */ unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */ + /* user_ns which owns the ipc ns */ + struct user_namespace *user_ns; }; extern struct ipc_namespace init_ipc_ns; diff --git a/ipc/msgutil.c b/ipc/msgutil.c index f095ee26883..8b5ce5d3f3e 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -32,6 +32,7 @@ struct ipc_namespace init_ipc_ns = { .mq_msg_max = DFLT_MSGMAX, .mq_msgsize_max = DFLT_MSGSIZEMAX, #endif + .user_ns = &init_user_ns, }; atomic_t nr_ipc_ns = ATOMIC_INIT(1); diff --git a/ipc/namespace.c b/ipc/namespace.c index a1094ff0bef..aa188996269 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -11,10 +11,11 @@ #include #include #include +#include #include "util.h" -static struct ipc_namespace *create_ipc_ns(void) +static struct ipc_namespace *create_ipc_ns(struct ipc_namespace *old_ns) { struct ipc_namespace *ns; int err; @@ -43,6 +44,9 @@ static struct ipc_namespace *create_ipc_ns(void) ipcns_notify(IPCNS_CREATED); register_ipcns_notifier(ns); + ns->user_ns = old_ns->user_ns; + get_user_ns(ns->user_ns); + return ns; } @@ -50,7 +54,7 @@ struct ipc_namespace *copy_ipcs(unsigned long flags, struct ipc_namespace *ns) { if (!(flags & CLONE_NEWIPC)) return get_ipc_ns(ns); - return create_ipc_ns(); + return create_ipc_ns(ns); } /* @@ -105,6 +109,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) * order to have a correct value when recomputing msgmni. */ ipcns_notify(IPCNS_REMOVED); + put_user_ns(ns->user_ns); } /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b97fc9d04dd..ac8a56e90bf 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -80,6 +80,11 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; } + if (new_nsp->ipc_ns != tsk->nsproxy->ipc_ns) { + put_user_ns(new_nsp->ipc_ns->user_ns); + new_nsp->ipc_ns->user_ns = task_cred_xxx(tsk, user)->user_ns; + get_user_ns(new_nsp->ipc_ns->user_ns); + } new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); if (IS_ERR(new_nsp->pid_ns)) { -- cgit v1.2.3-70-g09d2 From b0e77598f87107001a00b8a4ece9c95e4254ccc4 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:24 -0700 Subject: userns: user namespaces: convert several capable() calls CAP_IPC_OWNER and CAP_IPC_LOCK can be checked against current_user_ns(), because the resource comes from current's own ipc namespace. setuid/setgid are to uids in own namespace, so again checks can be against current_user_ns(). Changelog: Jan 11: Use task_ns_capable() in place of sched_capable(). Jan 11: Use nsown_capable() as suggested by Bastian Blank. 
Jan 11: Clarify (hopefully) some logic in futex and sched.c Feb 15: use ns_capable for ipc, not nsown_capable Feb 23: let copy_ipcs handle setting ipc_ns->user_ns Feb 23: pass ns down rather than taking it from current [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 7 ++++--- ipc/msg.c | 8 ++++---- ipc/namespace.c | 13 ++++++++----- ipc/sem.c | 10 ++++++---- ipc/shm.c | 9 +++++---- ipc/util.c | 26 ++++++++++++++++---------- ipc/util.h | 5 +++-- kernel/futex.c | 11 ++++++++++- kernel/futex_compat.c | 11 ++++++++++- kernel/groups.c | 2 +- kernel/nsproxy.c | 7 +------ kernel/sched.c | 9 ++++++--- kernel/uid16.c | 2 +- 13 files changed, 75 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index d3c32dcec62..a6d1655f960 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -5,6 +5,7 @@ #include #include #include +#include /* * ipc namespace events @@ -93,7 +94,7 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #if defined(CONFIG_IPC_NS) extern struct ipc_namespace *copy_ipcs(unsigned long flags, - struct ipc_namespace *ns); + struct task_struct *tsk); static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) @@ -104,12 +105,12 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, - struct ipc_namespace *ns) + struct task_struct *tsk) { if (flags & CLONE_NEWIPC) return ERR_PTR(-EINVAL); - return ns; + return tsk->nsproxy->ipc_ns; } static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) diff --git a/ipc/msg.c b/ipc/msg.c index 747b65507a9..0e732e92e22 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -421,7 +421,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, return -EFAULT; } - ipcp = ipcctl_pre_down(&msg_ids(ns), msqid, cmd, + ipcp = ipcctl_pre_down(ns, &msg_ids(ns), msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes); if (IS_ERR(ipcp)) return PTR_ERR(ipcp); @@ -539,7 +539,7 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) success_return = 0; } err = -EACCES; - if (ipcperms(&msq->q_perm, S_IRUGO)) + if (ipcperms(ns, &msq->q_perm, S_IRUGO)) goto out_unlock; err = security_msg_queue_msgctl(msq, cmd); @@ -664,7 +664,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, struct msg_sender s; err = -EACCES; - if (ipcperms(&msq->q_perm, S_IWUGO)) + if (ipcperms(ns, &msq->q_perm, S_IWUGO)) goto out_unlock_free; err = security_msg_queue_msgsnd(msq, msg, msgflg); @@ -774,7 +774,7 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext, struct list_head *tmp; msg = ERR_PTR(-EACCES); - if (ipcperms(&msq->q_perm, S_IRUGO)) + if (ipcperms(ns, &msq->q_perm, S_IRUGO)) goto out_unlock; msg = ERR_PTR(-EAGAIN); diff --git a/ipc/namespace.c b/ipc/namespace.c index aa188996269..3c3e5223e7e 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -15,7 +15,8 @@ #include "util.h" -static struct ipc_namespace *create_ipc_ns(struct ipc_namespace *old_ns) +static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, + struct ipc_namespace *old_ns) { struct ipc_namespace *ns; int err; @@ -44,17 +45,19 @@ static struct 
ipc_namespace *create_ipc_ns(struct ipc_namespace *old_ns) ipcns_notify(IPCNS_CREATED); register_ipcns_notifier(ns); - ns->user_ns = old_ns->user_ns; - get_user_ns(ns->user_ns); + ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); return ns; } -struct ipc_namespace *copy_ipcs(unsigned long flags, struct ipc_namespace *ns) +struct ipc_namespace *copy_ipcs(unsigned long flags, + struct task_struct *tsk) { + struct ipc_namespace *ns = tsk->nsproxy->ipc_ns; + if (!(flags & CLONE_NEWIPC)) return get_ipc_ns(ns); - return create_ipc_ns(ns); + return create_ipc_ns(tsk, ns); } /* diff --git a/ipc/sem.c b/ipc/sem.c index 0e0d49bbb86..ae040a0727c 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -817,7 +817,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, } err = -EACCES; - if (ipcperms (&sma->sem_perm, S_IRUGO)) + if (ipcperms(ns, &sma->sem_perm, S_IRUGO)) goto out_unlock; err = security_sem_semctl(sma, cmd); @@ -862,7 +862,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, nsems = sma->sem_nsems; err = -EACCES; - if (ipcperms (&sma->sem_perm, (cmd==SETVAL||cmd==SETALL)?S_IWUGO:S_IRUGO)) + if (ipcperms(ns, &sma->sem_perm, + (cmd == SETVAL || cmd == SETALL) ? S_IWUGO : S_IRUGO)) goto out_unlock; err = security_sem_semctl(sma, cmd); @@ -1047,7 +1048,8 @@ static int semctl_down(struct ipc_namespace *ns, int semid, return -EFAULT; } - ipcp = ipcctl_pre_down(&sem_ids(ns), semid, cmd, &semid64.sem_perm, 0); + ipcp = ipcctl_pre_down(ns, &sem_ids(ns), semid, cmd, + &semid64.sem_perm, 0); if (IS_ERR(ipcp)) return PTR_ERR(ipcp); @@ -1386,7 +1388,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, goto out_unlock_free; error = -EACCES; - if (ipcperms(&sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) + if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) goto out_unlock_free; error = security_sem_semop(sma, sops, nsops, alter); diff --git a/ipc/shm.c b/ipc/shm.c index 7d3bb22a930..8644452f5c4 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -623,7 +623,8 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, return -EFAULT; } - ipcp = ipcctl_pre_down(&shm_ids(ns), shmid, cmd, &shmid64.shm_perm, 0); + ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd, + &shmid64.shm_perm, 0); if (IS_ERR(ipcp)) return PTR_ERR(ipcp); @@ -737,7 +738,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) result = 0; } err = -EACCES; - if (ipcperms (&shp->shm_perm, S_IRUGO)) + if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) goto out_unlock; err = security_shm_shmctl(shp, cmd); if (err) @@ -773,7 +774,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) audit_ipc_obj(&(shp->shm_perm)); - if (!capable(CAP_IPC_LOCK)) { + if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { uid_t euid = current_euid(); err = -EPERM; if (euid != shp->shm_perm.uid && @@ -888,7 +889,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) } err = -EACCES; - if (ipcperms(&shp->shm_perm, acc_mode)) + if (ipcperms(ns, &shp->shm_perm, acc_mode)) goto out_unlock; err = security_shm_shmat(shp, shmaddr, shmflg); diff --git a/ipc/util.c b/ipc/util.c index 69a0cc13d96..8fd1b891ec0 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -329,12 +329,14 @@ retry: * * It is called with ipc_ids.rw_mutex and ipcp->lock held. 
*/ -static int ipc_check_perms(struct kern_ipc_perm *ipcp, struct ipc_ops *ops, - struct ipc_params *params) +static int ipc_check_perms(struct ipc_namespace *ns, + struct kern_ipc_perm *ipcp, + struct ipc_ops *ops, + struct ipc_params *params) { int err; - if (ipcperms(ipcp, params->flg)) + if (ipcperms(ns, ipcp, params->flg)) err = -EACCES; else { err = ops->associate(ipcp, params->flg); @@ -396,7 +398,7 @@ retry: * ipc_check_perms returns the IPC id on * success */ - err = ipc_check_perms(ipcp, ops, params); + err = ipc_check_perms(ns, ipcp, ops, params); } ipc_unlock(ipcp); } @@ -610,10 +612,12 @@ void ipc_rcu_putref(void *ptr) * * Check user, group, other permissions for access * to ipc resources. return 0 if allowed + * + * @flag will most probably be 0 or S_...UGO from */ -int ipcperms (struct kern_ipc_perm *ipcp, short flag) -{ /* flag will most probably be 0 or S_...UGO from */ +int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag) +{ uid_t euid = current_euid(); int requested_mode, granted_mode; @@ -627,7 +631,7 @@ int ipcperms (struct kern_ipc_perm *ipcp, short flag) granted_mode >>= 3; /* is there some bit set in requested_mode but not in granted_mode? */ if ((requested_mode & ~granted_mode & 0007) && - !capable(CAP_IPC_OWNER)) + !ns_capable(ns->user_ns, CAP_IPC_OWNER)) return -1; return security_ipc_permission(ipcp, flag); @@ -765,6 +769,7 @@ void ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) /** * ipcctl_pre_down - retrieve an ipc and check permissions for some IPC_XXX cmd + * @ids: the ipc namespace * @ids: the table of ids where to look for the ipc * @id: the id of the ipc to retrieve * @cmd: the cmd to check @@ -779,7 +784,8 @@ void ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * - returns the ipc with both ipc and rw_mutex locks held in case of success * or an err-code without any lock held otherwise. */ -struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd, +struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, + struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm) { struct kern_ipc_perm *ipcp; @@ -799,8 +805,8 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd, perm->gid, perm->mode); euid = current_euid(); - if (euid == ipcp->cuid || - euid == ipcp->uid || capable(CAP_SYS_ADMIN)) + if (euid == ipcp->cuid || euid == ipcp->uid || + ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ipcp; err = -EPERM; diff --git a/ipc/util.h b/ipc/util.h index 764b51a37a6..6f5c20bedaa 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -103,7 +103,7 @@ int ipc_get_maxid(struct ipc_ids *); void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *); /* must be called with ipcp locked */ -int ipcperms(struct kern_ipc_perm *ipcp, short flg); +int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); /* for rare, potentially huge allocations. 
* both function can sleep @@ -126,7 +126,8 @@ struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out); void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out); void ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out); -struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd, +struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, + struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm); #ifndef __ARCH_WANT_IPC_PARSE_VERSION diff --git a/kernel/futex.c b/kernel/futex.c index bda41571538..6570c459f31 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2418,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, goto err_unlock; ret = -EPERM; pcred = __task_cred(p); + /* If victim is in different user_ns, then uids are not + comparable, so we must have CAP_SYS_PTRACE */ + if (cred->user->user_ns != pcred->user->user_ns) { + if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) + goto err_unlock; + goto ok; + } + /* If victim is in same user_ns, then uids are comparable */ if (cred->euid != pcred->euid && cred->euid != pcred->uid && - !capable(CAP_SYS_PTRACE)) + !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) goto err_unlock; +ok: head = p->robust_list; rcu_read_unlock(); } diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index a7934ac75e5..5f9e689dc8f 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, goto err_unlock; ret = -EPERM; pcred = __task_cred(p); + /* If victim is in different user_ns, then uids are not + comparable, so we must have CAP_SYS_PTRACE */ + if (cred->user->user_ns != pcred->user->user_ns) { + if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) + goto err_unlock; + goto ok; + } + /* If victim is in same user_ns, then uids are comparable */ if (cred->euid != pcred->euid && cred->euid != pcred->uid && - !capable(CAP_SYS_PTRACE)) + !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) goto err_unlock; +ok: head = p->compat_robust_list; rcu_read_unlock(); } diff --git a/kernel/groups.c b/kernel/groups.c index 253dc0f35cf..1cc476d52dd 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!capable(CAP_SETGID)) + if (!nsown_capable(CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ac8a56e90bf..a05d191ffdd 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -75,16 +75,11 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_uts; } - new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); + new_nsp->ipc_ns = copy_ipcs(flags, tsk); if (IS_ERR(new_nsp->ipc_ns)) { err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; } - if (new_nsp->ipc_ns != tsk->nsproxy->ipc_ns) { - put_user_ns(new_nsp->ipc_ns->user_ns); - new_nsp->ipc_ns->user_ns = task_cred_xxx(tsk, user)->user_ns; - get_user_ns(new_nsp->ipc_ns->user_ns); - } new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); if (IS_ERR(new_nsp->pid_ns)) { diff --git a/kernel/sched.c b/kernel/sched.c index a172494a9a6..480adeb63f8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4892,8 +4892,11 @@ static bool check_same_owner(struct task_struct *p) rcu_read_lock(); pcred = __task_cred(p); - match = (cred->euid == 
pcred->euid || - cred->euid == pcred->uid); + if (cred->user->user_ns == pcred->user->user_ns) + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + else + match = false; rcu_read_unlock(); return match; } @@ -5221,7 +5224,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) + if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p); diff --git a/kernel/uid16.c b/kernel/uid16.c index 419209893d8..51c6e89e861 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!capable(CAP_SETGID)) + if (!nsown_capable(CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; -- cgit v1.2.3-70-g09d2 From f9b182e24ecb2b3bb33340f053ba31c8c4e1d895 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Wed, 23 Mar 2011 16:43:27 -0700 Subject: taskstats: use appropriate printk priority level printk()s without a priority level default to KERN_WARNING. To reduce noise at KERN_WARNING, this patch sets the priority level appropriately for unleveled printk()s. This should be useful to folks that look at dmesg warnings closely. Signed-off-by: Mandeep Singh Baines Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 3971c6b9d58..9ffea360a77 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -685,7 +685,7 @@ static int __init taskstats_init(void) goto err_cgroup_ops; family_registered = 1; - printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); + pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); return 0; err_cgroup_ops: genl_unregister_ops(&family, &taskstats_ops); -- cgit v1.2.3-70-g09d2 From 93a72052be81823fa1584b9be037d51924f9efa4 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Wed, 23 Mar 2011 16:43:29 -0700 Subject: crash_dump: export is_kdump_kernel to modules, consolidate elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn The Xen PV drivers in a crashed HVM guest cannot connect to the dom0 backend drivers because both frontend and backend drivers are still in connected state. To run the connection reset function only in case of a crashdump, the is_kdump_kernel() function needs to be available for the PV driver modules. Consolidate elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn into kernel/crash_dump.c. Also export elfcorehdr_addr to make is_kdump_kernel() usable for modules. Leave 'elfcorehdr' as early_param(). This changes powerpc from __setup() to early_param(). It adds an address range check from x86 also on ia64 and powerpc. [akpm@linux-foundation.org: additional #includes] [akpm@linux-foundation.org: remove elfcorehdr_addr export] [akpm@linux-foundation.org: fix for Tejun's mm/nobootmem.c changes] Signed-off-by: Olaf Hering Cc: Russell King Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/crash_dump.c | 3 --- arch/arm/kernel/setup.c | 24 ------------------------ arch/ia64/kernel/crash_dump.c | 3 --- arch/ia64/kernel/efi.c | 1 + arch/ia64/kernel/setup.c | 18 ------------------ arch/powerpc/kernel/crash_dump.c | 17 ----------------- arch/sh/kernel/crash_dump.c | 22 ---------------------- arch/x86/kernel/crash_dump_32.c | 3 --- arch/x86/kernel/crash_dump_64.c | 3 --- arch/x86/kernel/e820.c | 1 + arch/x86/kernel/setup.c | 22 ---------------------- include/linux/bootmem.h | 4 ---- kernel/Makefile | 1 + kernel/crash_dump.c | 34 ++++++++++++++++++++++++++++++++++ mm/bootmem.c | 8 -------- mm/nobootmem.c | 8 -------- 16 files changed, 37 insertions(+), 135 deletions(-) create mode 100644 kernel/crash_dump.c (limited to 'kernel') diff --git a/arch/arm/kernel/crash_dump.c b/arch/arm/kernel/crash_dump.c index cd3b853a8a6..90c50d4b43f 100644 --- a/arch/arm/kernel/crash_dump.c +++ b/arch/arm/kernel/crash_dump.c @@ -18,9 +18,6 @@ #include #include -/* stores the physical address of elf header of crash image */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - /** * copy_oldmem_page() - copy one page from old kernel memory * @pfn: page frame number to be copied diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index d1da9217427..c36c1a4250f 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -788,30 +788,6 @@ static void __init reserve_crashkernel(void) static inline void reserve_crashkernel(void) {} #endif /* CONFIG_KEXEC */ -/* - * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by - * is_kdump_kernel() to determine if we are booting after a panic. Hence - * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. - */ - -#ifdef CONFIG_CRASH_DUMP -/* - * elfcorehdr= specifies the location of elf core header stored by the crashed - * kernel. This option will be passed by kexec loader to the capture kernel. - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - - if (!arg) - return -EINVAL; - - elfcorehdr_addr = memparse(arg, &end); - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -#endif /* CONFIG_CRASH_DUMP */ - static void __init squash_mem_tags(struct tag *tag) { for (; tag->hdr.size; tag = tag_next(tag)) diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c index 23e91290e41..c8c9298666f 100644 --- a/arch/ia64/kernel/crash_dump.c +++ b/arch/ia64/kernel/crash_dump.c @@ -13,9 +13,6 @@ #include #include -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index a0f00192850..6fc03aff046 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -23,6 +23,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 911cf974970..5e2c72498c5 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -479,25 +479,7 @@ static __init int setup_nomca(char *s) } early_param("nomca", setup_nomca); -/* - * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by - * is_kdump_kernel() to determine if we are booting after a panic. Hence - * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. 
- */ #ifdef CONFIG_CRASH_DUMP -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. - */ -static int __init parse_elfcorehdr(char *arg) -{ - if (!arg) - return -EINVAL; - - elfcorehdr_addr = memparse(arg, &arg); - return 0; -} -early_param("elfcorehdr", parse_elfcorehdr); - int __init reserve_elfcorehdr(u64 *start, u64 *end) { u64 length; diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index 0a2af50243c..424afb6b8fb 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -28,9 +28,6 @@ #define DBG(fmt...) #endif -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - #ifndef CONFIG_RELOCATABLE void __init reserve_kdump_trampoline(void) { @@ -72,20 +69,6 @@ void __init setup_kdump_trampoline(void) } #endif /* CONFIG_RELOCATABLE */ -/* - * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by - * is_kdump_kernel() to determine if we are booting after a panic. Hence - * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. - */ -static int __init parse_elfcorehdr(char *p) -{ - if (p) - elfcorehdr_addr = memparse(p, &p); - - return 1; -} -__setup("elfcorehdr=", parse_elfcorehdr); - static int __init parse_savemaxmem(char *p) { if (p) diff --git a/arch/sh/kernel/crash_dump.c b/arch/sh/kernel/crash_dump.c index 37c97d44457..569e7b171c0 100644 --- a/arch/sh/kernel/crash_dump.c +++ b/arch/sh/kernel/crash_dump.c @@ -9,28 +9,6 @@ #include #include -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - -/* - * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by - * is_kdump_kernel() to determine if we are booting after a panic. Hence - * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. - * - * elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. - */ -static int __init parse_elfcorehdr(char *arg) -{ - if (!arg) - return -EINVAL; - - elfcorehdr_addr = memparse(arg, &arg); - - return 0; -} -early_param("elfcorehdr", parse_elfcorehdr); - /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index d5cd13945d5..642f75a68cd 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -14,9 +14,6 @@ static void *kdump_buf_page; -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - static inline bool is_crashed_pfn_valid(unsigned long pfn) { #ifndef CONFIG_X86_PAE diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 994828899e0..afa64adb75e 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -10,9 +10,6 @@ #include #include -/* Stores the physical address of elf header of crash image. 
*/ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index cdf5bfd9d4d..3e2ef842531 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 32bd87cbf98..5a0484a95ad 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -619,28 +619,6 @@ void __init reserve_standard_io_resources(void) } -/* - * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by - * is_kdump_kernel() to determine if we are booting after a panic. Hence - * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. - */ - -#ifdef CONFIG_CRASH_DUMP -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - if (!arg) - return -EINVAL; - elfcorehdr_addr = memparse(arg, &end); - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -#endif - static __init void reserve_ibft_region(void) { unsigned long addr, size = 0; diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 499dfe982a0..b8613e806aa 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -19,10 +19,6 @@ extern unsigned long min_low_pfn; */ extern unsigned long max_pfn; -#ifdef CONFIG_CRASH_DUMP -extern unsigned long saved_max_pfn; -#endif - #ifndef CONFIG_NO_BOOTMEM /* * node_bootmem_map is a map pointer - the bits represent all physical diff --git a/kernel/Makefile b/kernel/Makefile index 353d3fe8ba3..85cbfb31e73 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c new file mode 100644 index 00000000000..5f85690285d --- /dev/null +++ b/kernel/crash_dump.c @@ -0,0 +1,34 @@ +#include +#include +#include +#include +#include + +/* + * If we have booted due to a crash, max_pfn will be a very low value. We need + * to know the amount of memory that the previous kernel used. + */ +unsigned long saved_max_pfn; + +/* + * stores the physical address of elf header of crash image + * + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence put + * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + +/* + * elfcorehdr= specifies the location of elf core header stored by the crashed + * kernel. This option will be passed by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) +{ + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 
0 : -EINVAL; +} +early_param("elfcorehdr", setup_elfcorehdr); diff --git a/mm/bootmem.c b/mm/bootmem.c index 07aeb89e396..01d5a4b3dd0 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -34,14 +34,6 @@ unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -#ifdef CONFIG_CRASH_DUMP -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; -#endif - bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index e2bdb07079c..e99f6cd1da1 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -32,14 +32,6 @@ unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -#ifdef CONFIG_CRASH_DUMP -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; -#endif - static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) { -- cgit v1.2.3-70-g09d2 From 0f77a8d378254f27df4a114a5da67223af1fe93f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 24 Mar 2011 11:42:29 +0900 Subject: vsprintf: Introduce %pB format specifier The %pB format specifier is for stack backtrace. Its handler sprint_backtrace() does symbol lookup using (address-1) to ensure the address will not point outside of the function. If there is a tail-call to the function marked "noreturn", gcc optimized out the code after the call then causes saved return address points outside of the function (i.e. the start of the next function), so pollutes call trace somewhat. This patch adds the %pB printk mechanism that allows architecture call-trace printout functions to improve backtrace printouts. Signed-off-by: Namhyung Kim Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Andrew Morton Cc: linux-arch@vger.kernel.org LKML-Reference: <1300934550-21394-1-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kallsyms.h | 7 +++++++ kernel/kallsyms.c | 44 +++++++++++++++++++++++++++++++++++++++++--- lib/vsprintf.c | 7 ++++++- 3 files changed, 54 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index d8e9b3d1c23..0df513b7a9f 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -36,6 +36,7 @@ const char *kallsyms_lookup(unsigned long addr, /* Look up a kernel symbol and return it in a text buffer. */ extern int sprint_symbol(char *buffer, unsigned long address); +extern int sprint_backtrace(char *buffer, unsigned long address); /* Look up a kernel symbol and print it to the kernel messages. */ extern void __print_symbol(const char *fmt, unsigned long address); @@ -79,6 +80,12 @@ static inline int sprint_symbol(char *buffer, unsigned long addr) return 0; } +static inline int sprint_backtrace(char *buffer, unsigned long addr) +{ + *buffer = '\0'; + return 0; +} + static inline int lookup_symbol_name(unsigned long addr, char *symname) { return -ERANGE; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b575..59e879929b1 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, } /* Look up a kernel symbol and return it in a text buffer. 
*/ -int sprint_symbol(char *buffer, unsigned long address) +static int __sprint_symbol(char *buffer, unsigned long address, + int symbol_offset) { char *modname; const char *name; unsigned long offset, size; int len; + address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) return sprintf(buffer, "0x%lx", address); @@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address) strcpy(buffer, name); len = strlen(buffer); buffer += len; + offset -= symbol_offset; if (modname) - len += sprintf(buffer, "+%#lx/%#lx [%s]", - offset, size, modname); + len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); else len += sprintf(buffer, "+%#lx/%#lx", offset, size); return len; } + +/** + * sprint_symbol - Look up a kernel symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function looks up a kernel symbol with @address and stores its name, + * offset, size and module name to @buffer if possible. If no symbol was found, + * just saves its @address as is. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_symbol(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, 0); +} + EXPORT_SYMBOL_GPL(sprint_symbol); +/** + * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function is for stack backtrace and does the same thing as + * sprint_symbol() but with modified/decreased @address. If there is a + * tail-call to the function marked "noreturn", gcc optimized out code after + * the call so that the stack-saved return address could point outside of the + * caller. This function ensures that kallsyms will find the original caller + * by decreasing @address. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_backtrace(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, -1); +} + /* Look up a kernel symbol and print it to the kernel messages. 
*/ void __print_symbol(const char *fmt, unsigned long address) { diff --git a/lib/vsprintf.c b/lib/vsprintf.c index d3023df8477..d9e01fc3168 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -574,7 +574,9 @@ char *symbol_string(char *buf, char *end, void *ptr, unsigned long value = (unsigned long) ptr; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; - if (ext != 'f' && ext != 's') + if (ext == 'B') + sprint_backtrace(sym, value); + else if (ext != 'f' && ext != 's') sprint_symbol(sym, value); else kallsyms_lookup(value, NULL, NULL, NULL, sym); @@ -949,6 +951,7 @@ int kptr_restrict = 1; * - 'f' For simple symbolic function names without offset * - 'S' For symbolic direct pointers with offset * - 's' For symbolic direct pointers without offset + * - 'B' For backtraced symbolic direct pointers with offset * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref] * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201] * - 'M' For a 6-byte MAC address, it prints the address in the @@ -1008,6 +1011,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, /* Fallthrough */ case 'S': case 's': + case 'B': return symbol_string(buf, end, ptr, spec, *fmt); case 'R': case 'r': @@ -1279,6 +1283,7 @@ qualifier: * %ps output the name of a text symbol without offset * %pF output the name of a function pointer with its offset * %pf output the name of a function pointer without its offset + * %pB output the name of a backtrace symbol with its offset * %pR output the address range in a struct resource with decoded flags * %pr output the address range in a struct resource with raw flags * %pM output a 6-byte MAC address with colons -- cgit v1.2.3-70-g09d2 From 29096202176ceaa5016a17ea2dd1aea19a4e90e2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Mar 2011 15:21:07 -0400 Subject: futex: Fix WARN_ON() test for UP An update of the futex code had a WARN_ON(!spin_is_locked(q->lock_ptr)) But on UP, spin_is_locked() is always false, and will trigger this warning, and even worse, it will exit the function without doing the necessary work. Converting this to a WARN_ON_SMP() fixes the problem. Reported-by: Richard Weinberger Tested-by: Richard Weinberger Signed-off-by: Steven Rostedt Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Darren Hart Cc: Lai Jiangshan LKML-Reference: <20110317192208.682654502@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index bda41571538..823aae3e2a9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -782,8 +782,8 @@ static void __unqueue_futex(struct futex_q *q) { struct futex_hash_bucket *hb; - if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr) - || plist_node_empty(&q->list))) + if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) + || WARN_ON(plist_node_empty(&q->list))) return; hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); -- cgit v1.2.3-70-g09d2 From ab7798ffcf98b11a9525cf65bacdae3fd58d357f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Mar 2011 16:48:50 +0100 Subject: genirq: Expand generic show_interrupts() Some archs want to print extra information for certain irq_chips which is per irq and not per chip. Allow them to provide a chip callback to print the chip name and the extra information. PowerPC wants to print the LEVEL/EDGE type information. Make it configurable. 
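As an illustration only (not part of this patch), an irq_chip could wire up the new optional callback roughly as follows; the "demo-pic" chip, its mask/unmask stubs and the bank-in-chip_data convention are hypothetical:

#include <linux/irq.h>
#include <linux/seq_file.h>

/* Hypothetical chip: prints "demo-picN" in /proc/interrupts instead of a plain chip name. */
static void demo_pic_irq_mask(struct irq_data *d)   { /* hw masking elided */ }
static void demo_pic_irq_unmask(struct irq_data *d) { /* hw unmasking elided */ }

static void demo_pic_print_chip(struct irq_data *d, struct seq_file *p)
{
	/* Assumes this chip stores a per-IRQ bank number in chip_data. */
	unsigned long bank = (unsigned long)irq_data_get_irq_chip_data(d);

	seq_printf(p, " demo-pic%lu", bank);
}

static struct irq_chip demo_pic_chip = {
	.name		= "demo-pic",
	.irq_mask	= demo_pic_irq_mask,
	.irq_unmask	= demo_pic_irq_unmask,
	.irq_print_chip	= demo_pic_print_chip,
};

An architecture that wants the extra Level/Edge column would additionally select GENERIC_IRQ_SHOW_LEVEL in its Kconfig, alongside GENERIC_IRQ_SHOW.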
Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ++++ kernel/irq/Kconfig | 4 ++++ kernel/irq/proc.c | 15 ++++++++++++++- 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 1d3577f30d4..5d876c9b3a3 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -28,6 +28,7 @@ #include #include +struct seq_file; struct irq_desc; struct irq_data; typedef void (*irq_flow_handler_t)(unsigned int irq, @@ -270,6 +271,7 @@ static inline bool irqd_can_move_in_process_context(struct irq_data *d) * @irq_set_wake: enable/disable power-management wake-on of an IRQ * @irq_bus_lock: function to lock access to slow bus (i2c) chips * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips + * @irq_print_chip: optional to print special chip info in show_interrupts * @flags: chip specific flags * * @release: release function solely used by UML @@ -317,6 +319,8 @@ struct irq_chip { void (*irq_bus_lock)(struct irq_data *data); void (*irq_bus_sync_unlock)(struct irq_data *data); + void (*irq_print_chip)(struct irq_data *data, struct seq_file *p); + unsigned long flags; /* Currently used only by UML, might disappear one day.*/ diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 09bef82d74c..00f2c037267 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -31,6 +31,10 @@ config GENERIC_IRQ_PROBE config GENERIC_IRQ_SHOW bool +# Print level/edge extra information +config GENERIC_IRQ_SHOW_LEVEL + bool + # Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 760248de109..626d092eed9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -404,7 +404,20 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - seq_printf(p, " %8s", desc->irq_data.chip->name); + + if (desc->irq_data.chip) { + if (desc->irq_data.chip->irq_print_chip) + desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); + else if (desc->irq_data.chip->name) + seq_printf(p, " %8s", desc->irq_data.chip->name); + else + seq_printf(p, " %8s", "-"); + } else { + seq_printf(p, " %8s", "None"); + } +#ifdef CONFIG_GENIRC_IRQ_SHOW_LEVEL + seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? 
"Level" : "Edge"); +#endif if (desc->name) seq_printf(p, "-%-8s", desc->name); -- cgit v1.2.3-70-g09d2 From 27029c339b1beebe79bb4e64422ad1bb8d0b6440 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Mon, 15 Mar 2010 07:28:00 -0500 Subject: kdb: code cleanup to use macro instead of value It's better to use macro KDB_BASE_CMD_MAX instead of 50 Signed-off-by: Jovi Zhang Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index bd3e8e29caa..38a85428c70 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -78,7 +78,7 @@ static unsigned int kdb_continue_catastrophic; static kdbtab_t *kdb_commands; #define KDB_BASE_CMD_MAX 50 static int kdb_max_commands = KDB_BASE_CMD_MAX; -static kdbtab_t kdb_base_commands[50]; +static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX]; #define for_each_kdbcmd(cmd, num) \ for ((cmd) = kdb_base_commands, (num) = 0; \ num < kdb_max_commands; \ -- cgit v1.2.3-70-g09d2 From 0d3db28daed2529ab90933a3aaaaf46446fdfda8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 15 Mar 2010 07:28:00 -0500 Subject: kdb: add usage string of 'per_cpu' command Signed-off-by: Namhyung Kim Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 38a85428c70..6bc6e3bc4f9 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2892,7 +2892,7 @@ static void __init kdb_inittab(void) "Send a signal to a process", 0, KDB_REPEAT_NONE); kdb_register_repeat("summary", kdb_summary, "", "Summarize the system", 4, KDB_REPEAT_NONE); - kdb_register_repeat("per_cpu", kdb_per_cpu, "", + kdb_register_repeat("per_cpu", kdb_per_cpu, " [] []", "Display per_cpu variables", 3, KDB_REPEAT_NONE); kdb_register_repeat("grephelp", kdb_grep_help, "", "Display help on | grep", 0, KDB_REPEAT_NONE); -- cgit v1.2.3-70-g09d2 From d72274e5895d11570a0a4a3214a1933c86d5ccb7 Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 25 Mar 2011 12:38:48 -0700 Subject: genirq: Reserve the irq when calling irq_set_chip() The helper macros and functions like for_each_active_irq() don't work unless the irq is in the allocated_irqs set. In the case of !CONFIG_SPARSE_IRQ, instead of forcing all users of the irq infrastructure to explicitly call irq_reserve_irq(), do it for them. Signed-off-by: David Daney Cc: linux-mips@linux-mips.org Cc: ralf@linux-mips.org LKML-Reference: <1301081931-11240-2-git-send-email-ddaney@caviumnetworks.com> Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c9c0601f061..c35d74c08b5 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -37,6 +37,12 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) irq_chip_set_defaults(chip); desc->irq_data.chip = chip; irq_put_desc_unlock(desc, flags); + /* + * For !CONFIG_SPARSE_IRQ make the irq show up in + * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is + * already marked, and this call is harmless. 
+ */ + irq_reserve_irq(irq); return 0; } EXPORT_SYMBOL(irq_set_chip); -- cgit v1.2.3-70-g09d2 From 801a0e9ae36e9b487092e31699d28c0b9a21ad52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 27 Mar 2011 11:02:49 +0200 Subject: genirq: Add irq disabled flag to irq_data state Some irq_chip implementation require to know the disabled state of the interrupt in certain callbacks. Add a state flag and accessor to irq_data. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 8 ++++++++ kernel/irq/chip.c | 5 +++-- kernel/irq/irqdesc.c | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 5d876c9b3a3..8649b0fb9da 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -174,6 +174,8 @@ struct irq_data { * from suspend * IRDQ_MOVE_PCNTXT - Interrupt can be moved in process * context + * IRQD_IRQ_DISABLED - Some chip function need to know the + * disabled state. */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -184,6 +186,7 @@ enum { IRQD_LEVEL = (1 << 13), IRQD_WAKEUP_STATE = (1 << 14), IRQD_MOVE_PCNTXT = (1 << 15), + IRQD_IRQ_DISABLED = (1 << 16), }; static inline bool irqd_is_setaffinity_pending(struct irq_data *d) @@ -235,6 +238,11 @@ static inline bool irqd_can_move_in_process_context(struct irq_data *d) return d->state_use_accessors & IRQD_MOVE_PCNTXT; } +static inline bool irqd_irq_disabled(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_IRQ_DISABLED; +} + /** * struct irq_chip - hardware interrupt chip descriptor * diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c35d74c08b5..0a890bdd9c6 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -141,12 +141,14 @@ EXPORT_SYMBOL_GPL(irq_get_irq_data); static void irq_state_clr_disabled(struct irq_desc *desc) { desc->istate &= ~IRQS_DISABLED; + irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); irq_compat_clr_disabled(desc); } static void irq_state_set_disabled(struct irq_desc *desc) { desc->istate |= IRQS_DISABLED; + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); irq_compat_set_disabled(desc); } @@ -648,8 +650,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); - irq_compat_set_disabled(desc); - desc->istate |= IRQS_DISABLED; + irq_state_set_disabled(desc); desc->depth = 1; } desc->handle_irq = handle; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 6fb014f172f..96c3268a509 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -80,6 +80,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; -- cgit v1.2.3-70-g09d2 From 0fdb4b259ed3e91b687ac26848202f5e7c217e62 Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 25 Mar 2011 12:38:49 -0700 Subject: genirq: Add chip hooks for taking CPUs on/off line. 
[ tglx: Removed the enabled argument as this is now available in irq_data ] Signed-off-by: David Daney Cc: linux-mips@linux-mips.org Cc: ralf@linux-mips.org LKML-Reference: <1301081931-11240-3-git-send-email-ddaney@caviumnetworks.com> Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 8 ++++++++ kernel/irq/chip.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 8649b0fb9da..c2a0c192969 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -279,6 +279,8 @@ static inline bool irqd_irq_disabled(struct irq_data *d) * @irq_set_wake: enable/disable power-management wake-on of an IRQ * @irq_bus_lock: function to lock access to slow bus (i2c) chips * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips + * @irq_cpu_online: configure an interrupt source for a secondary CPU + * @irq_cpu_offline: un-configure an interrupt source for a secondary CPU * @irq_print_chip: optional to print special chip info in show_interrupts * @flags: chip specific flags * @@ -327,6 +329,9 @@ struct irq_chip { void (*irq_bus_lock)(struct irq_data *data); void (*irq_bus_sync_unlock)(struct irq_data *data); + void (*irq_cpu_online)(struct irq_data *data); + void (*irq_cpu_offline)(struct irq_data *data); + void (*irq_print_chip)(struct irq_data *data, struct seq_file *p); unsigned long flags; @@ -372,6 +377,9 @@ struct irqaction; extern int setup_irq(unsigned int irq, struct irqaction *new); extern void remove_irq(unsigned int irq, struct irqaction *act); +extern void irq_cpu_online(void); +extern void irq_cpu_offline(void); + #ifdef CONFIG_GENERIC_HARDIRQS #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0a890bdd9c6..44b16a1ecd9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -696,3 +696,61 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_put_desc_unlock(desc, flags); } + +/** + * irq_cpu_online - Invoke all irq_cpu_online functions. + * + * Iterate through all irqs and invoke the chip.irq_cpu_online() + * for each. + */ +void irq_cpu_online(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + chip = irq_data_get_irq_chip(&desc->irq_data); + + if (chip && chip->irq_cpu_online) + chip->irq_cpu_online(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} + +/** + * irq_cpu_offline - Invoke all irq_cpu_offline functions. + * + * Iterate through all irqs and invoke the chip.irq_cpu_offline() + * for each. 
+ */ +void irq_cpu_offline(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + chip = irq_data_get_irq_chip(&desc->irq_data); + + if (chip && chip->irq_cpu_offline) + chip->irq_cpu_offline(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} -- cgit v1.2.3-70-g09d2 From b3d422329f2e061d66af4f933ef316e50e5edcac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 27 Mar 2011 16:05:36 +0200 Subject: genirq: Add chip flag for restricting cpu_on/offline calls Add a flag which indicates that the on/offline callback should only be called on enabled interrupts. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 3 +++ kernel/irq/chip.c | 10 ++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index c2a0c192969..76e948fa88f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -348,11 +348,14 @@ struct irq_chip { * IRQCHIP_SET_TYPE_MASKED: Mask before calling chip.irq_set_type() * IRQCHIP_EOI_IF_HANDLED: Only issue irq_eoi() when irq was handled * IRQCHIP_MASK_ON_SUSPEND: Mask non wake irqs in the suspend path + * IRQCHIP_ONOFFLINE_ENABLED: Only call irq_on/off_line callbacks + * when irq enabled */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), IRQCHIP_EOI_IF_HANDLED = (1 << 1), IRQCHIP_MASK_ON_SUSPEND = (1 << 2), + IRQCHIP_ONOFFLINE_ENABLED = (1 << 3), }; /* This include will go away once we isolated irq_desc usage to core code */ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 44b16a1ecd9..9283d3300ea 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -718,8 +718,9 @@ void irq_cpu_online(void) raw_spin_lock_irqsave(&desc->lock, flags); chip = irq_data_get_irq_chip(&desc->irq_data); - - if (chip && chip->irq_cpu_online) + if (chip && chip->irq_cpu_online && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !(desc->istate & IRQS_DISABLED))) chip->irq_cpu_online(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -747,8 +748,9 @@ void irq_cpu_offline(void) raw_spin_lock_irqsave(&desc->lock, flags); chip = irq_data_get_irq_chip(&desc->irq_data); - - if (chip && chip->irq_cpu_offline) + if (chip && chip->irq_cpu_offline && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !(desc->istate & IRQS_DISABLED))) chip->irq_cpu_offline(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.3-70-g09d2 From c2d0c555c22242c3a76e366074c4d83ef9fa3b8c Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 25 Mar 2011 12:38:50 -0700 Subject: genirq: Split irq_set_affinity() so it can be called with lock held. The .irq_cpu_online() and .irq_cpu_offline() functions may need to adjust affinity, but they are called with the descriptor lock held. Create __irq_set_affinity_locked() which is called with the lock held. Make irq_set_affinity() just a wrapper that acquires the lock. 
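A minimal sketch of the intended use (hypothetical chip callback, not taken from the patch): an .irq_cpu_offline() hook runs with the descriptor lock already held, so it must call the locked variant rather than irq_set_affinity():

#include <linux/irq.h>
#include <linux/cpumask.h>

/*
 * Hypothetical: move an interrupt away from a CPU that is going down.
 * desc->lock is held when this hook is invoked, so calling
 * irq_set_affinity() here would deadlock on the same lock.
 */
static void demo_chip_irq_cpu_offline(struct irq_data *d)
{
	__irq_set_affinity_locked(d, cpu_online_mask);
}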
[ tglx: Changed the argument to irq_data, added a !desc check and moved the !irq_set_affinity check where it belongs ] Signed-off-by: David Daney Cc: linux-mips@linux-mips.org Cc: ralf@linux-mips.org LKML-Reference: <1301081931-11240-4-git-send-email-ddaney@caviumnetworks.com> Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 + kernel/irq/manage.c | 48 ++++++++++++++++++++++++++++++------------------ 2 files changed, 31 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 76e948fa88f..a10717e1c1f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -382,6 +382,7 @@ extern void remove_irq(unsigned int irq, struct irqaction *act); extern void irq_cpu_online(void); extern void irq_cpu_offline(void); +extern int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *cpumask); #ifdef CONFIG_GENERIC_HARDIRQS diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a2aa73e536..3d151fd762a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -139,35 +139,26 @@ static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } #endif -/** - * irq_set_affinity - Set the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @cpumask: cpumask - * - */ -int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_chip *chip = desc->irq_data.chip; - unsigned long flags; + struct irq_chip *chip = irq_data_get_irq_chip(data); + struct irq_desc *desc = irq_data_to_desc(data); int ret = 0; - if (!chip->irq_set_affinity) + if (!chip || !chip->irq_set_affinity) return -EINVAL; - raw_spin_lock_irqsave(&desc->lock, flags); - - if (irq_can_move_pcntxt(desc)) { - ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + if (irqd_can_move_in_process_context(data)) { + ret = chip->irq_set_affinity(data, mask, false); switch (ret) { case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, mask); + cpumask_copy(data->affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); ret = 0; } } else { - irqd_set_move_pending(&desc->irq_data); + irqd_set_move_pending(data); irq_copy_pending(desc, mask); } @@ -176,7 +167,28 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) schedule_work(&desc->affinity_notify->work); } irq_compat_set_affinity(desc); - irqd_set(&desc->irq_data, IRQD_AFFINITY_SET); + irqd_set(data, IRQD_AFFINITY_SET); + + return ret; +} + +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + */ +int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + int ret; + + if (!desc) + return -EINVAL; + + raw_spin_lock_irqsave(&desc->lock, flags); + ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } -- cgit v1.2.3-70-g09d2 From 32f4125ebffee4f3c4dbc6a437fc656129eb9e60 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 14:10:52 +0200 Subject: genirq: Move INPROGRESS, MASKED and DISABLED state flags to irq_data We really need these flags for some of the interrupt chips. Move it from internal state to irq_data and provide proper accessors. 
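For illustration (helper name made up, not part of the patch), code that previously tested desc->istate bits is now expected to go through the irq_data accessors introduced here:

#include <linux/irq.h>
#include <linux/irqdesc.h>

/*
 * Hypothetical helper: the old form would have checked
 * desc->istate & (IRQS_MASKED | IRQS_DISABLED | IRQS_INPROGRESS).
 */
static bool demo_irq_needs_unmask(struct irq_desc *desc)
{
	struct irq_data *d = &desc->irq_data;

	return irqd_irq_masked(d) &&
	       !irqd_irq_disabled(d) &&
	       !irqd_irq_inprogress(d);
}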
Signed-off-by: Thomas Gleixner Cc: David Daney --- include/linux/irq.h | 17 +++++++++++++++-- kernel/irq/chip.c | 40 +++++++++++++++++++--------------------- kernel/irq/debug.h | 10 +++++++--- kernel/irq/handle.c | 4 ++-- kernel/irq/internals.h | 6 ------ kernel/irq/irqdesc.c | 2 -- kernel/irq/manage.c | 30 ++++++++++++++---------------- kernel/irq/migration.c | 4 ++-- kernel/irq/spurious.c | 10 +++++----- 9 files changed, 64 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index a10717e1c1f..18aacccb0fa 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -174,8 +174,9 @@ struct irq_data { * from suspend * IRDQ_MOVE_PCNTXT - Interrupt can be moved in process * context - * IRQD_IRQ_DISABLED - Some chip function need to know the - * disabled state. + * IRQD_IRQ_DISABLED - Disabled state of the interrupt + * IRQD_IRQ_MASKED - Masked state of the interrupt + * IRQD_IRQ_INPROGRESS - In progress state of the interrupt */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -187,6 +188,8 @@ enum { IRQD_WAKEUP_STATE = (1 << 14), IRQD_MOVE_PCNTXT = (1 << 15), IRQD_IRQ_DISABLED = (1 << 16), + IRQD_IRQ_MASKED = (1 << 17), + IRQD_IRQ_INPROGRESS = (1 << 18), }; static inline bool irqd_is_setaffinity_pending(struct irq_data *d) @@ -243,6 +246,16 @@ static inline bool irqd_irq_disabled(struct irq_data *d) return d->state_use_accessors & IRQD_IRQ_DISABLED; } +static inline bool irqd_irq_masked(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_IRQ_MASKED; +} + +static inline bool irqd_irq_inprogress(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_IRQ_INPROGRESS; +} + /** * struct irq_chip - hardware interrupt chip descriptor * diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9283d3300ea..e00bdc56269 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -140,27 +140,25 @@ EXPORT_SYMBOL_GPL(irq_get_irq_data); static void irq_state_clr_disabled(struct irq_desc *desc) { - desc->istate &= ~IRQS_DISABLED; irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); irq_compat_clr_disabled(desc); } static void irq_state_set_disabled(struct irq_desc *desc) { - desc->istate |= IRQS_DISABLED; irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); irq_compat_set_disabled(desc); } static void irq_state_clr_masked(struct irq_desc *desc) { - desc->istate &= ~IRQS_MASKED; + irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); irq_compat_clr_masked(desc); } static void irq_state_set_masked(struct irq_desc *desc) { - desc->istate |= IRQS_MASKED; + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); irq_compat_set_masked(desc); } @@ -380,11 +378,11 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; irq_compat_set_progress(desc); - desc->istate |= IRQS_INPROGRESS; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); @@ -392,7 +390,7 @@ void handle_nested_irq(unsigned int irq) note_interrupt(irq, desc, action_ret); raw_spin_lock_irq(&desc->lock); - desc->istate &= ~IRQS_INPROGRESS; + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); irq_compat_clr_progress(desc); out_unlock: @@ -424,14 +422,14 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if 
(!irq_check_poll(desc)) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; handle_irq_event(desc); @@ -456,7 +454,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out_unlock; @@ -467,12 +465,12 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * keep it masked and get out of here */ - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; handle_irq_event(desc); - if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT))) + if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); @@ -504,7 +502,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out; @@ -515,7 +513,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * then mask it and get out of here: */ - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; mask_irq(desc); @@ -566,8 +564,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * we shouldn't process the IRQ. Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || - !desc->action))) { + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { if (!irq_check_poll(desc)) { irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; @@ -592,15 +590,15 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * Renable it, if it was not disabled in meantime. 
*/ if (unlikely(desc->istate & IRQS_PENDING)) { - if (!(desc->istate & IRQS_DISABLED) && - (desc->istate & IRQS_MASKED)) + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) unmask_irq(desc); } handle_irq_event(desc); } while ((desc->istate & IRQS_PENDING) && - !(desc->istate & IRQS_DISABLED)); + !irqd_irq_disabled(&desc->irq_data)); out_unlock: raw_spin_unlock(&desc->lock); @@ -720,7 +718,7 @@ void irq_cpu_online(void) chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_online && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || - !(desc->istate & IRQS_DISABLED))) + !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_online(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -750,7 +748,7 @@ void irq_cpu_offline(void) chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_offline && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || - !(desc->istate & IRQS_DISABLED))) + !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_offline(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index d1a33b7fa61..a0bd875ba3d 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -6,6 +6,8 @@ #define P(f) if (desc->status & f) printk("%14s set\n", #f) #define PS(f) if (desc->istate & f) printk("%14s set\n", #f) +/* FIXME */ +#define PD(f) do { } while (0) static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { @@ -28,13 +30,15 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_NOAUTOEN); PS(IRQS_AUTODETECT); - PS(IRQS_INPROGRESS); PS(IRQS_REPLAY); PS(IRQS_WAITING); - PS(IRQS_DISABLED); PS(IRQS_PENDING); - PS(IRQS_MASKED); + + PD(IRQS_INPROGRESS); + PD(IRQS_DISABLED); + PD(IRQS_MASKED); } #undef P #undef PS +#undef PD diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 517561fc731..60fd5cd75c7 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -178,13 +178,13 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) irq_compat_clr_pending(desc); desc->istate &= ~IRQS_PENDING; irq_compat_set_progress(desc); - desc->istate |= IRQS_INPROGRESS; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); ret = handle_irq_event_percpu(desc, action); raw_spin_lock(&desc->lock); - desc->istate &= ~IRQS_INPROGRESS; + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); irq_compat_clr_progress(desc); return ret; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 6c6ec9a4902..6b8b9713e28 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -44,26 +44,20 @@ enum { * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt * detection * IRQS_POLL_INPROGRESS - polling in progress - * IRQS_INPROGRESS - Interrupt in progress * IRQS_ONESHOT - irq is not unmasked in primary handler * IRQS_REPLAY - irq is replayed * IRQS_WAITING - irq is waiting - * IRQS_DISABLED - irq is disabled * IRQS_PENDING - irq is pending and replayed later - * IRQS_MASKED - irq is masked * IRQS_SUSPENDED - irq is suspended */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, IRQS_POLL_INPROGRESS = 0x00000008, - IRQS_INPROGRESS = 0x00000010, IRQS_ONESHOT = 0x00000020, IRQS_REPLAY = 0x00000040, IRQS_WAITING = 0x00000080, - IRQS_DISABLED = 0x00000100, IRQS_PENDING = 0x00000200, - IRQS_MASKED = 0x00000400, IRQS_SUSPENDED = 0x00000800, }; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 96c3268a509..2c039c9b938 100644 --- a/kernel/irq/irqdesc.c 
+++ b/kernel/irq/irqdesc.c @@ -81,7 +81,6 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); - desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; @@ -239,7 +238,6 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .istate = IRQS_DISABLED, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3d151fd762a..6e8acb75599 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -41,7 +41,7 @@ early_param("threadirqs", setup_forced_irqthreads); void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned int state; + bool inprogress; if (!desc) return; @@ -53,16 +53,16 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->istate & IRQS_INPROGRESS) + while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); - state = desc->istate; + inprogress = irqd_irq_inprogress(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ - } while (state & IRQS_INPROGRESS); + } while (inprogress); /* * We made sure that no hardirq handler is running. Now verify @@ -563,9 +563,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, flags &= IRQ_TYPE_SENSE_MASK; if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { - if (!(desc->istate & IRQS_MASKED)) + if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); - if (!(desc->istate & IRQS_DISABLED)) + if (!irqd_irq_disabled(&desc->irq_data)) unmask = 1; } @@ -663,7 +663,7 @@ again: * irq_wake_thread(). See the comment there which explains the * serialization. 
*/ - if (unlikely(desc->istate & IRQS_INPROGRESS)) { + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); @@ -680,12 +680,10 @@ again: desc->threads_oneshot &= ~action->thread_mask; - if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) && - (desc->istate & IRQS_MASKED)) { - irq_compat_clr_masked(desc); - desc->istate &= ~IRQS_MASKED; - desc->irq_data.chip->irq_unmask(&desc->irq_data); - } + if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) + unmask_irq(desc); + out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); @@ -779,7 +777,7 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); raw_spin_lock_irq(&desc->lock); - if (unlikely(desc->istate & IRQS_DISABLED)) { + if (unlikely(irqd_irq_disabled(&desc->irq_data))) { /* * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which @@ -997,8 +995,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ - IRQS_INPROGRESS | IRQS_ONESHOT | \ - IRQS_WAITING); + IRQS_ONESHOT | IRQS_WAITING); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); if (new->flags & IRQF_PERCPU) { irqd_set(&desc->irq_data, IRQD_PER_CPU); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index ec4806d4778..5e81d34b08d 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -66,7 +66,7 @@ void irq_move_irq(struct irq_data *idata) if (likely(!irqd_is_setaffinity_pending(idata))) return; - if (unlikely(desc->istate & IRQS_DISABLED)) + if (unlikely(irqd_irq_disabled(idata))) return; /* @@ -74,7 +74,7 @@ void irq_move_irq(struct irq_data *idata) * threaded interrupt with ONESHOT set, we can end up with an * interrupt storm. */ - masked = desc->istate & IRQS_MASKED; + masked = irqd_irq_masked(idata); if (!masked) idata->chip->irq_mask(idata); irq_move_masked_irq(idata); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dd586ebf9c8..cd424cdf17f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -45,12 +45,12 @@ bool irq_wait_for_poll(struct irq_desc *desc) #ifdef CONFIG_SMP do { raw_spin_unlock(&desc->lock); - while (desc->istate & IRQS_INPROGRESS) + while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); raw_spin_lock(&desc->lock); - } while (desc->istate & IRQS_INPROGRESS); + } while irqd_irq_inprogress(&desc->irq_data); /* Might have been disabled in meantime */ - return !(desc->istate & IRQS_DISABLED) && desc->action; + return !irqd_irq_disabled(&desc->irq_data) && desc->action; #else return false; #endif @@ -75,7 +75,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Do not poll disabled interrupts unless the spurious * disabled poller asks explicitely. 
*/ - if ((desc->istate & IRQS_DISABLED) && !force) + if (irqd_irq_disabled(&desc->irq_data) && !force) goto out; /* @@ -88,7 +88,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) goto out; /* Already running on another processor */ - if (desc->istate & IRQS_INPROGRESS) { + if (irqd_irq_inprogress(&desc->irq_data)) { /* * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too -- cgit v1.2.3-70-g09d2 From 0521c8fbb3da45c2a58cd551ca6e9644983f6028 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 16:13:24 +0200 Subject: genirq: Provide edge_eoi flow handler This is a replacment for the cell flow handler which is in the way of cleanups. Must be selected to avoid general bloat. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 + kernel/irq/Kconfig | 4 ++++ kernel/irq/chip.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 18aacccb0fa..44ebca74578 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -423,6 +423,7 @@ extern irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action); extern void handle_level_irq(unsigned int irq, struct irq_desc *desc); extern void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc); extern void handle_edge_irq(unsigned int irq, struct irq_desc *desc); +extern void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc); extern void handle_simple_irq(unsigned int irq, struct irq_desc *desc); extern void handle_percpu_irq(unsigned int irq, struct irq_desc *desc); extern void handle_bad_irq(unsigned int irq, struct irq_desc *desc); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 00f2c037267..72606ba10b1 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -51,6 +51,10 @@ config HARDIRQS_SW_RESEND config IRQ_PREFLOW_FASTEOI bool +# Edge style eoi based handler (cell) +config IRQ_EDGE_EOI_HANDLER + bool + # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e00bdc56269..451d1e81c15 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -604,6 +604,51 @@ out_unlock: raw_spin_unlock(&desc->lock); } +#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER +/** + * handle_edge_eoi_irq - edge eoi type IRQ handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Similar as the above handle_edge_irq, but using eoi and w/o the + * mask/unmask logic. + */ +void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + + raw_spin_lock(&desc->lock); + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + /* + * If we're currently running this IRQ, or its disabled, + * we shouldn't process the IRQ. 
Mark it pending, handle + * the necessary masking and go out + */ + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { + if (!irq_check_poll(desc)) { + desc->istate |= IRQS_PENDING; + goto out_eoi; + } + } + kstat_incr_irqs_this_cpu(irq, desc); + + do { + if (unlikely(!desc->action)) + goto out_eoi; + + handle_irq_event(desc); + + } while ((desc->istate & IRQS_PENDING) && + !irqd_irq_disabled(&desc->irq_data)); + +out_unlock: + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); +} +#endif + /** * handle_percpu_irq - Per CPU local irq handler * @irq: the interrupt number -- cgit v1.2.3-70-g09d2 From 33b054b867b84015173a38d9cd9ff513b6498818 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 16:27:31 +0200 Subject: genirq: Remove handle_IRQ_event Last user gone. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 3 --- kernel/irq/handle.c | 12 ------------ 2 files changed, 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 44ebca74578..41fc783171f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -413,9 +413,6 @@ static inline void irq_move_masked_irq(struct irq_data *data) { } extern int no_irq_affinity; -/* Handle irq action chains: */ -extern irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action); - /* * Built-in IRQ handlers for various IRQ types, * callable via desc->handle_irq() diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 60fd5cd75c7..1a2fb77f2fd 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -188,15 +188,3 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) irq_compat_clr_progress(desc); return ret; } - -/** - * handle_IRQ_event - irq action chain handler - * @irq: the interrupt number - * @action: the interrupt action chain for this irq - * - * Handles the action chain of an irq event - */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) -{ - return handle_irq_event_percpu(irq_to_desc(irq), action); -} -- cgit v1.2.3-70-g09d2 From 30398bf6c684a77274dbdabf7efc1f24e4a99028 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 18 Mar 2011 09:33:56 -0700 Subject: genirq: Fix new kernel-doc warnings Fix new irq-related kernel-doc warnings in 2.6.38: Warning(kernel/irq/manage.c:149): No description found for parameter 'mask' Warning(kernel/irq/manage.c:149): Excess function parameter 'cpumask' description in 'irq_set_affinity' Warning(include/linux/irq.h:161): No description found for parameter 'state_use_accessors' Warning(include/linux/irq.h:161): Excess struct/union/enum/typedef member 'state_use_accessor' description in 'irq_data' Signed-off-by: Randy Dunlap LKML-Reference: <20110318093356.b939558d.randy.dunlap@oracle.com> Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 2 +- kernel/irq/manage.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 34b954e5426..b3741c83774 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -135,7 +135,7 @@ struct msi_desc; * struct irq_data - per irq and irq chip data passed down to chip functions * @irq: interrupt number * @node: node index useful for balancing - * @state_use_accessor: status information for irq chip functions. + * @state_use_accessors: status information for irq chip functions. 
* Use accessor functions to deal with it * @chip: low level interrupt hardware access * @handler_data: per-IRQ data for the irq_chip methods diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6e8acb75599..805c6a0ce78 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -175,7 +175,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity - * @cpumask: cpumask + * @mask: cpumask * */ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) -- cgit v1.2.3-70-g09d2 From a6aeddd1c4e464a2150f97ca2d1c3d68cfbd9296 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 20:28:56 +0200 Subject: genirq: Fix typo and remove unused variable Sigh, I'm overworked. Signed-off-by: Thomas Gleixner --- kernel/irq/migration.c | 1 - kernel/irq/spurious.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 5e81d34b08d..e33d9c8d508 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -60,7 +60,6 @@ void move_masked_irq(int irq) void irq_move_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_data_to_desc(idata); bool masked; if (likely(!irqd_is_setaffinity_pending(idata))) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index cd424cdf17f..83f4799f46b 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -48,7 +48,7 @@ bool irq_wait_for_poll(struct irq_desc *desc) while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); raw_spin_lock(&desc->lock); - } while irqd_irq_inprogress(&desc->irq_data); + } while (irqd_irq_inprogress(&desc->irq_data)); /* Might have been disabled in meantime */ return !irqd_irq_disabled(&desc->irq_data) && desc->action; #else -- cgit v1.2.3-70-g09d2 From 243b422af9ea9af4ead07a8ad54c90d4f9b6081a Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 28 Mar 2011 14:13:35 -0700 Subject: Relax si_code check in rt_sigqueueinfo and rt_tgsigqueueinfo Commit da48524eb206 ("Prevent rt_sigqueueinfo and rt_tgsigqueueinfo from spoofing the signal code") made the check on si_code too strict. There are several legitimate places where glibc wants to queue a negative si_code different from SI_QUEUE: - This was first noticed with glibc's aio implementation, which wants to queue a signal with si_code SI_ASYNCIO; the current kernel causes glibc's tst-aio4 test to fail because rt_sigqueueinfo() fails with EPERM. - Further examination of the glibc source shows that getaddrinfo_a() wants to use SI_ASYNCNL (which the kernel does not even define). The timer_create() fallback code wants to queue signals with SI_TIMER. As suggested by Oleg Nesterov , loosen the check to forbid only the problematic SI_TKILL case. Reported-by: Klaus Dittrich Acked-by: Julien Tinnes Cc: Signed-off-by: Roland Dreier Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 324eff5468a..1186cf7fac7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2437,7 +2437,7 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. 
*/ - if (info.si_code != SI_QUEUE) { + if (info.si_code >= 0 || info.si_code == SI_TKILL) { /* We used to allow any < 0 si_code */ WARN_ON_ONCE(info.si_code < 0); return -EPERM; @@ -2457,7 +2457,7 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ - if (info->si_code != SI_QUEUE) { + if (info->si_code >= 0 || info->si_code == SI_TKILL) { /* We used to allow any < 0 si_code */ WARN_ON_ONCE(info->si_code < 0); return -EPERM; -- cgit v1.2.3-70-g09d2 From 0ef5ca1e1f0de71300142b8f730f26ded6a0c2f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 21:59:37 +0200 Subject: genirq; Fix cleanup fallout I missed the CONFIG_GENERIC_PENDING_IRQ dependency in the affinity related functions and the IRQ_LEVEL propagation into irq_data state. Did not pop up on my main test platforms. :( Signed-off-by: Thomas Gleixner Tested-by: David Daney --- kernel/irq/chip.c | 2 ++ kernel/irq/manage.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 451d1e81c15..03099d521f5 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -734,6 +734,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irqd_set(&desc->irq_data, IRQD_PER_CPU); if (irq_settings_can_move_pcntxt(desc)) irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); + if (irq_settings_is_level(desc)) + irqd_set(&desc->irq_data, IRQD_LEVEL); irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 805c6a0ce78..acf540768b8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -112,13 +112,13 @@ void irq_set_thread_affinity(struct irq_desc *desc) } #ifdef CONFIG_GENERIC_PENDING_IRQ -static inline bool irq_can_move_pcntxt(struct irq_desc *desc) +static inline bool irq_can_move_pcntxt(struct irq_data *data) { - return irq_settings_can_move_pcntxt(desc); + return irqd_can_move_in_process_context(data); } -static inline bool irq_move_pending(struct irq_desc *desc) +static inline bool irq_move_pending(struct irq_data *data) { - return irqd_is_setaffinity_pending(&desc->irq_data); + return irqd_is_setaffinity_pending(data); } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) @@ -131,8 +131,8 @@ irq_get_pending(struct cpumask *mask, struct irq_desc *desc) cpumask_copy(mask, desc->pending_mask); } #else -static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; } -static inline bool irq_move_pending(struct irq_desc *desc) { return false; } +static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } +static inline bool irq_move_pending(struct irq_desc *data) { return false; } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } static inline void @@ -148,7 +148,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) if (!chip || !chip->irq_set_affinity) return -EINVAL; - if (irqd_can_move_in_process_context(data)) { + if (irq_can_move_pcntxt(data)) { ret = chip->irq_set_affinity(data, mask, false); switch (ret) { case IRQ_SET_MASK_OK: @@ -218,7 +218,7 @@ static void irq_affinity_notify(struct work_struct *work) goto out; raw_spin_lock_irqsave(&desc->lock, flags); - if (irq_move_pending(desc)) + if (irq_move_pending(&desc->irq_data)) 
irq_get_pending(cpumask, desc); else cpumask_copy(cpumask, desc->irq_data.affinity); -- cgit v1.2.3-70-g09d2 From cd22c0e44b105aecd78e5f9e77abab3a1b8dc00c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 Mar 2011 11:36:05 +0200 Subject: genirq: Fix harmless typo The late night fixup missed to convert the data type from irq_desc to irq_data, which results in a harmless but annoying warning. Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index acf540768b8..b3bf54f7d97 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -132,7 +132,7 @@ irq_get_pending(struct cpumask *mask, struct irq_desc *desc) } #else static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } -static inline bool irq_move_pending(struct irq_desc *data) { return false; } +static inline bool irq_move_pending(struct irq_data *data) { return false; } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } static inline void -- cgit v1.2.3-70-g09d2 From a6e120ed42004d6051fff7c3233e2554f12ccecb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Mar 2011 22:20:51 +0100 Subject: alpha: Use generic show_interrupts() The only subtle difference is that alpha uses ACTUAL_NR_IRQS and prints the IRQF_DISABLED flag. Change the generic implementation to deal with ACTUAL_NR_IRQS if defined. The IRQF_DISABLED printing is pointless, as we nowadays run all interrupts with irqs disabled. Signed-off-by: Thomas Gleixner --- arch/alpha/Kconfig | 1 + arch/alpha/kernel/irq.c | 67 ++++++++----------------------------------------- kernel/irq/proc.c | 8 ++++-- 3 files changed, 17 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index cc31bec2e31..bd4160c5719 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -11,6 +11,7 @@ config ALPHA select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE select AUTO_IRQ_AFFINITY if SMP + select GENERIC_IRQ_SHOW select GENERIC_HARDIRQS_NO_DEPRECATED help The Alpha is a 64-bit general-purpose processor designed and diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c index 1461d7c53b4..381431a2d6d 100644 --- a/arch/alpha/kernel/irq.c +++ b/arch/alpha/kernel/irq.c @@ -67,68 +67,21 @@ int irq_select_affinity(unsigned int irq) } #endif /* CONFIG_SMP */ -int -show_interrupts(struct seq_file *p, void *v) +int arch_show_interrupts(struct seq_file *p, int prec) { int j; - int irq = *(loff_t *) v; - struct irqaction * action; - struct irq_desc *desc; - unsigned long flags; #ifdef CONFIG_SMP - if (irq == 0) { - seq_puts(p, " "); - for_each_online_cpu(j) - seq_printf(p, "CPU%d ", j); - seq_putc(p, '\n'); - } -#endif - - if (irq < ACTUAL_NR_IRQS) { - desc = irq_to_desc(irq); - - if (!desc) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - if (!action) - goto unlock; - seq_printf(p, "%3d: ", irq); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(irq)); -#else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(irq, j)); + seq_puts(p, "IPI: "); + for_each_online_cpu(j) + seq_printf(p, "%10lu ", cpu_data[j].ipi_count); + seq_putc(p, '\n'); #endif - seq_printf(p, " %14s", irq_desc_get_chip(desc)->name); - seq_printf(p, " %c%s", - (action->flags & IRQF_DISABLED)?'+':' ', - action->name); - - for (action=action->next; action; action = action->next) { - seq_printf(p, ", %c%s", - 
(action->flags & IRQF_DISABLED)?'+':' ', - action->name); - } - - seq_putc(p, '\n'); -unlock: - raw_spin_unlock_irqrestore(&desc->lock, flags); - } else if (irq == ACTUAL_NR_IRQS) { -#ifdef CONFIG_SMP - seq_puts(p, "IPI: "); - for_each_online_cpu(j) - seq_printf(p, "%10lu ", cpu_data[j].ipi_count); - seq_putc(p, '\n'); -#endif - seq_puts(p, "PMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10lu ", per_cpu(irq_pmi_count, j)); - seq_puts(p, " Performance Monitoring\n"); - seq_printf(p, "ERR: %10lu\n", irq_err_count); - } + seq_puts(p, "PMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10lu ", per_cpu(irq_pmi_count, j)); + seq_puts(p, " Performance Monitoring\n"); + seq_printf(p, "ERR: %10lu\n", irq_err_count); return 0; } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 626d092eed9..dd201bd3510 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -364,6 +364,10 @@ int __weak arch_show_interrupts(struct seq_file *p, int prec) return 0; } +#ifndef ACTUAL_NR_IRQS +# define ACTUAL_NR_IRQS nr_irqs +#endif + int show_interrupts(struct seq_file *p, void *v) { static int prec; @@ -373,10 +377,10 @@ int show_interrupts(struct seq_file *p, void *v) struct irqaction *action; struct irq_desc *desc; - if (i > nr_irqs) + if (i > ACTUAL_NR_IRQS) return 0; - if (i == nr_irqs) + if (i == ACTUAL_NR_IRQS) return arch_show_interrupts(p, prec); /* print header and calculate the width of the first column */ -- cgit v1.2.3-70-g09d2 From 0c6f8a8b917ad361319c8ace3e9f28e69bfdb4c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 13:32:20 +0200 Subject: genirq: Remove compat code Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 117 ------------------------------------------- include/linux/irqdesc.h | 60 +--------------------- kernel/irq/Kconfig | 4 -- kernel/irq/autoprobe.c | 4 +- kernel/irq/chip.c | 129 ------------------------------------------------ kernel/irq/compat.h | 72 --------------------------- kernel/irq/debug.h | 2 +- kernel/irq/dummychip.c | 9 ---- kernel/irq/handle.c | 3 -- kernel/irq/internals.h | 10 ---- kernel/irq/manage.c | 10 +--- kernel/irq/resend.c | 1 - kernel/irq/settings.h | 55 ++++++++------------- kernel/irq/spurious.c | 1 - 14 files changed, 25 insertions(+), 452 deletions(-) delete mode 100644 kernel/irq/compat.h (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index b3741c83774..7b3faace437 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -92,18 +92,6 @@ enum { IRQ_NO_BALANCING = (1 << 13), IRQ_MOVE_PCNTXT = (1 << 14), IRQ_NESTED_THREAD = (1 << 15), - -#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT - IRQ_INPROGRESS = (1 << 16), - IRQ_REPLAY = (1 << 17), - IRQ_WAITING = (1 << 18), - IRQ_DISABLED = (1 << 19), - IRQ_PENDING = (1 << 20), - IRQ_MASKED = (1 << 21), - IRQ_MOVE_PENDING = (1 << 22), - IRQ_AFFINITY_SET = (1 << 23), - IRQ_WAKEUP = (1 << 24), -#endif }; #define IRQF_MODIFY_MASK \ @@ -321,28 +309,6 @@ static inline void irqd_clr_chained_irq_inprogress(struct irq_data *d) */ struct irq_chip { const char *name; -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - unsigned int (*startup)(unsigned int irq); - void (*shutdown)(unsigned int irq); - void (*enable)(unsigned int irq); - void (*disable)(unsigned int irq); - - void (*ack)(unsigned int irq); - void (*mask)(unsigned int irq); - void (*mask_ack)(unsigned int irq); - void (*unmask)(unsigned int irq); - void (*eoi)(unsigned int irq); - - void (*end)(unsigned int irq); - int (*set_affinity)(unsigned int irq, - const struct cpumask *dest); - int 
(*retrigger)(unsigned int irq); - int (*set_type)(unsigned int irq, unsigned int flow_type); - int (*set_wake)(unsigned int irq, unsigned int on); - - void (*bus_lock)(unsigned int irq); - void (*bus_sync_unlock)(unsigned int irq); -#endif unsigned int (*irq_startup)(struct irq_data *data); void (*irq_shutdown)(struct irq_data *data); void (*irq_enable)(struct irq_data *data); @@ -589,89 +555,6 @@ static inline struct msi_desc *irq_data_get_msi(struct irq_data *d) return d->msi_desc; } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -/* Please do not use: Use the replacement functions instead */ -static inline int set_irq_chip(unsigned int irq, struct irq_chip *chip) -{ - return irq_set_chip(irq, chip); -} -static inline int set_irq_data(unsigned int irq, void *data) -{ - return irq_set_handler_data(irq, data); -} -static inline int set_irq_chip_data(unsigned int irq, void *data) -{ - return irq_set_chip_data(irq, data); -} -static inline int set_irq_type(unsigned int irq, unsigned int type) -{ - return irq_set_irq_type(irq, type); -} -static inline int set_irq_msi(unsigned int irq, struct msi_desc *entry) -{ - return irq_set_msi_desc(irq, entry); -} -static inline struct irq_chip *get_irq_chip(unsigned int irq) -{ - return irq_get_chip(irq); -} -static inline void *get_irq_chip_data(unsigned int irq) -{ - return irq_get_chip_data(irq); -} -static inline void *get_irq_data(unsigned int irq) -{ - return irq_get_handler_data(irq); -} -static inline void *irq_data_get_irq_data(struct irq_data *d) -{ - return irq_data_get_irq_handler_data(d); -} -static inline struct msi_desc *get_irq_msi(unsigned int irq) -{ - return irq_get_msi_desc(irq); -} -static inline void set_irq_noprobe(unsigned int irq) -{ - irq_set_noprobe(irq); -} -static inline void set_irq_probe(unsigned int irq) -{ - irq_set_probe(irq); -} -static inline void set_irq_nested_thread(unsigned int irq, int nest) -{ - irq_set_nested_thread(irq, nest); -} -static inline void -set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, - irq_flow_handler_t handle, const char *name) -{ - irq_set_chip_and_handler_name(irq, chip, handle, name); -} -static inline void -set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, - irq_flow_handler_t handle) -{ - irq_set_chip_and_handler(irq, chip, handle); -} -static inline void -__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, - const char *name) -{ - __irq_set_handler(irq, handle, is_chained, name); -} -static inline void set_irq_handler(unsigned int irq, irq_flow_handler_t handle) -{ - irq_set_handler(irq, handle); -} -static inline void -set_irq_chained_handler(unsigned int irq, irq_flow_handler_t handle) -{ - irq_set_chained_handler(irq, handle); -} -#endif - int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node); void irq_free_descs(unsigned int irq, unsigned int cnt); int irq_reserve_irqs(unsigned int from, unsigned int cnt); diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 15e6c3905f4..a082905b5eb 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -35,32 +35,7 @@ struct timer_rand_state; * @name: flow handler name for /proc/interrupts output */ struct irq_desc { - -#ifdef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED struct irq_data irq_data; -#else - /* - * This union will go away, once we fixed the direct access to - * irq_desc all over the place. The direct fields are a 1:1 - * overlay of irq_data. 
- */ - union { - struct irq_data irq_data; - struct { - unsigned int irq; - unsigned int node; - unsigned int pad_do_not_even_think_about_it; - struct irq_chip *chip; - void *handler_data; - void *chip_data; - struct msi_desc *msi_desc; -#ifdef CONFIG_SMP - cpumask_var_t affinity; -#endif - }; - }; -#endif - struct timer_rand_state *timer_rand_state; unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; @@ -68,11 +43,7 @@ struct irq_desc { irq_preflow_handler_t preflow_handler; #endif struct irqaction *action; /* IRQ action list */ -#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT unsigned int status_use_accessors; -#else - unsigned int status; /* IRQ status */ -#endif unsigned int core_internal_state__do_not_mess_with_it; unsigned int depth; /* nested irq disables */ unsigned int wake_depth; /* nested wake enables */ @@ -127,27 +98,6 @@ static inline struct msi_desc *irq_desc_get_msi_desc(struct irq_desc *desc) return desc->irq_data.msi_desc; } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -static inline struct irq_chip *get_irq_desc_chip(struct irq_desc *desc) -{ - return irq_desc_get_chip(desc); -} -static inline void *get_irq_desc_data(struct irq_desc *desc) -{ - return irq_desc_get_handler_data(desc); -} - -static inline void *get_irq_desc_chip_data(struct irq_desc *desc) -{ - return irq_desc_get_chip_data(desc); -} - -static inline struct msi_desc *get_irq_desc_msi(struct irq_desc *desc) -{ - return irq_desc_get_msi_desc(desc); -} -#endif - /* * Architectures call this to let the generic IRQ layer * handle an interrupt. If the descriptor is attached to an @@ -194,21 +144,13 @@ __irq_set_chip_handler_name_locked(unsigned int irq, struct irq_chip *chip, desc->name = name; } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -static inline void __set_irq_handler_unlocked(int irq, - irq_flow_handler_t handler) -{ - __irq_set_handler_locked(irq, handler); -} - static inline int irq_balancing_disabled(unsigned int irq) { struct irq_desc *desc; desc = irq_to_desc(irq); - return desc->status & IRQ_NO_BALANCING_MASK; + return desc->status_use_accessors & IRQ_NO_BALANCING_MASK; } -#endif static inline void irq_set_lockdep_class(unsigned int irq, struct lock_class_key *class) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 72606ba10b1..a69c333f78e 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -10,10 +10,6 @@ menu "IRQ subsystem" config GENERIC_HARDIRQS def_bool y -# Select this to disable the deprecated stuff -config GENERIC_HARDIRQS_NO_DEPRECATED - bool - config GENERIC_HARDIRQS_NO_COMPAT bool diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 394784c5706..342d8f44e40 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -70,10 +70,8 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc)) { - irq_compat_set_pending(desc); + if (irq_startup(desc)) desc->istate |= IRQS_PENDING; - } } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 03099d521f5..616ec1c6b06 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -34,7 +34,6 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - irq_chip_set_defaults(chip); desc->irq_data.chip = chip; irq_put_desc_unlock(desc, flags); /* @@ -141,25 +140,21 @@ EXPORT_SYMBOL_GPL(irq_get_irq_data); static void irq_state_clr_disabled(struct irq_desc *desc) { irqd_clear(&desc->irq_data, 
IRQD_IRQ_DISABLED); - irq_compat_clr_disabled(desc); } static void irq_state_set_disabled(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); - irq_compat_set_disabled(desc); } static void irq_state_clr_masked(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); - irq_compat_clr_masked(desc); } static void irq_state_set_masked(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); - irq_compat_set_masked(desc); } int irq_startup(struct irq_desc *desc) @@ -209,126 +204,6 @@ void irq_disable(struct irq_desc *desc) } } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -/* Temporary migration helpers */ -static void compat_irq_mask(struct irq_data *data) -{ - data->chip->mask(data->irq); -} - -static void compat_irq_unmask(struct irq_data *data) -{ - data->chip->unmask(data->irq); -} - -static void compat_irq_ack(struct irq_data *data) -{ - data->chip->ack(data->irq); -} - -static void compat_irq_mask_ack(struct irq_data *data) -{ - data->chip->mask_ack(data->irq); -} - -static void compat_irq_eoi(struct irq_data *data) -{ - data->chip->eoi(data->irq); -} - -static void compat_irq_enable(struct irq_data *data) -{ - data->chip->enable(data->irq); -} - -static void compat_irq_disable(struct irq_data *data) -{ - data->chip->disable(data->irq); -} - -static void compat_irq_shutdown(struct irq_data *data) -{ - data->chip->shutdown(data->irq); -} - -static unsigned int compat_irq_startup(struct irq_data *data) -{ - return data->chip->startup(data->irq); -} - -static int compat_irq_set_affinity(struct irq_data *data, - const struct cpumask *dest, bool force) -{ - return data->chip->set_affinity(data->irq, dest); -} - -static int compat_irq_set_type(struct irq_data *data, unsigned int type) -{ - return data->chip->set_type(data->irq, type); -} - -static int compat_irq_set_wake(struct irq_data *data, unsigned int on) -{ - return data->chip->set_wake(data->irq, on); -} - -static int compat_irq_retrigger(struct irq_data *data) -{ - return data->chip->retrigger(data->irq); -} - -static void compat_bus_lock(struct irq_data *data) -{ - data->chip->bus_lock(data->irq); -} - -static void compat_bus_sync_unlock(struct irq_data *data) -{ - data->chip->bus_sync_unlock(data->irq); -} -#endif - -/* - * Fixup enable/disable function pointers - */ -void irq_chip_set_defaults(struct irq_chip *chip) -{ -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - if (chip->enable) - chip->irq_enable = compat_irq_enable; - if (chip->disable) - chip->irq_disable = compat_irq_disable; - if (chip->shutdown) - chip->irq_shutdown = compat_irq_shutdown; - if (chip->startup) - chip->irq_startup = compat_irq_startup; - if (!chip->end) - chip->end = dummy_irq_chip.end; - if (chip->bus_lock) - chip->irq_bus_lock = compat_bus_lock; - if (chip->bus_sync_unlock) - chip->irq_bus_sync_unlock = compat_bus_sync_unlock; - if (chip->mask) - chip->irq_mask = compat_irq_mask; - if (chip->unmask) - chip->irq_unmask = compat_irq_unmask; - if (chip->ack) - chip->irq_ack = compat_irq_ack; - if (chip->mask_ack) - chip->irq_mask_ack = compat_irq_mask_ack; - if (chip->eoi) - chip->irq_eoi = compat_irq_eoi; - if (chip->set_affinity) - chip->irq_set_affinity = compat_irq_set_affinity; - if (chip->set_type) - chip->irq_set_type = compat_irq_set_type; - if (chip->set_wake) - chip->irq_set_wake = compat_irq_set_wake; - if (chip->retrigger) - chip->irq_retrigger = compat_irq_retrigger; -#endif -} - static inline void mask_ack_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask_ack) @@ -381,7 +256,6 @@ void 
handle_nested_irq(unsigned int irq) if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; - irq_compat_set_progress(desc); irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -391,7 +265,6 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - irq_compat_clr_progress(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); @@ -514,7 +387,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * then mask it and get out of here: */ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; @@ -567,7 +439,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(irqd_irq_disabled(&desc->irq_data) || irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { if (!irq_check_poll(desc)) { - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; mask_ack_irq(desc); goto out_unlock; diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h deleted file mode 100644 index 6bbaf66aca8..00000000000 --- a/kernel/irq/compat.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Compat layer for transition period - */ -#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -static inline void irq_compat_set_progress(struct irq_desc *desc) -{ - desc->status |= IRQ_INPROGRESS; -} - -static inline void irq_compat_clr_progress(struct irq_desc *desc) -{ - desc->status &= ~IRQ_INPROGRESS; -} -static inline void irq_compat_set_disabled(struct irq_desc *desc) -{ - desc->status |= IRQ_DISABLED; -} -static inline void irq_compat_clr_disabled(struct irq_desc *desc) -{ - desc->status &= ~IRQ_DISABLED; -} -static inline void irq_compat_set_pending(struct irq_desc *desc) -{ - desc->status |= IRQ_PENDING; -} - -static inline void irq_compat_clr_pending(struct irq_desc *desc) -{ - desc->status &= ~IRQ_PENDING; -} -static inline void irq_compat_set_masked(struct irq_desc *desc) -{ - desc->status |= IRQ_MASKED; -} - -static inline void irq_compat_clr_masked(struct irq_desc *desc) -{ - desc->status &= ~IRQ_MASKED; -} -static inline void irq_compat_set_move_pending(struct irq_desc *desc) -{ - desc->status |= IRQ_MOVE_PENDING; -} - -static inline void irq_compat_clr_move_pending(struct irq_desc *desc) -{ - desc->status &= ~IRQ_MOVE_PENDING; -} -static inline void irq_compat_set_affinity(struct irq_desc *desc) -{ - desc->status |= IRQ_AFFINITY_SET; -} - -static inline void irq_compat_clr_affinity(struct irq_desc *desc) -{ - desc->status &= ~IRQ_AFFINITY_SET; -} -#else -static inline void irq_compat_set_progress(struct irq_desc *desc) { } -static inline void irq_compat_clr_progress(struct irq_desc *desc) { } -static inline void irq_compat_set_disabled(struct irq_desc *desc) { } -static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } -static inline void irq_compat_set_pending(struct irq_desc *desc) { } -static inline void irq_compat_clr_pending(struct irq_desc *desc) { } -static inline void irq_compat_set_masked(struct irq_desc *desc) { } -static inline void irq_compat_clr_masked(struct irq_desc *desc) { } -static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } -static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } -static inline void irq_compat_set_affinity(struct irq_desc *desc) { } -static inline void irq_compat_clr_affinity(struct irq_desc *desc) { } -#endif - diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index a0bd875ba3d..306cba37e9a 100644 
--- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -4,7 +4,7 @@ #include -#define P(f) if (desc->status & f) printk("%14s set\n", #f) +#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) #define PS(f) if (desc->istate & f) printk("%14s set\n", #f) /* FIXME */ #define PD(f) do { } while (0) diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index 20dc5474947..b5fcd96c710 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -31,13 +31,6 @@ static unsigned int noop_ret(struct irq_data *data) return 0; } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -static void compat_noop(unsigned int irq) { } -#define END_INIT .end = compat_noop -#else -#define END_INIT -#endif - /* * Generic no controller implementation */ @@ -48,7 +41,6 @@ struct irq_chip no_irq_chip = { .irq_enable = noop, .irq_disable = noop, .irq_ack = ack_bad, - END_INIT }; /* @@ -64,5 +56,4 @@ struct irq_chip dummy_irq_chip = { .irq_ack = noop, .irq_mask = noop, .irq_unmask = noop, - END_INIT }; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 1a2fb77f2fd..90cb55f6d7e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -175,9 +175,7 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) struct irqaction *action = desc->action; irqreturn_t ret; - irq_compat_clr_pending(desc); desc->istate &= ~IRQS_PENDING; - irq_compat_set_progress(desc); irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); @@ -185,6 +183,5 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) raw_spin_lock(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - irq_compat_clr_progress(desc); return ret; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 6b8b9713e28..6546431447d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,10 +15,6 @@ #define istate core_internal_state__do_not_mess_with_it -#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -# define status status_use_accessors -#endif - extern int noirqdebug; /* @@ -61,15 +57,11 @@ enum { IRQS_SUSPENDED = 0x00000800, }; -#include "compat.h" #include "debug.h" #include "settings.h" #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) -/* Set default functions for irq_chip structures: */ -extern void irq_chip_set_defaults(struct irq_chip *chip); - extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); @@ -156,13 +148,11 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) static inline void irqd_set_move_pending(struct irq_data *d) { d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; - irq_compat_set_move_pending(irq_data_to_desc(d)); } static inline void irqd_clr_move_pending(struct irq_data *d) { d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; - irq_compat_clr_move_pending(irq_data_to_desc(d)); } static inline void irqd_clear(struct irq_data *d, unsigned int mask) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b3bf54f7d97..12a80fdae11 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -166,7 +166,6 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) kref_get(&desc->affinity_notify->kref); schedule_work(&desc->affinity_notify->work); } - irq_compat_set_affinity(desc); irqd_set(data, IRQD_AFFINITY_SET); return ret; @@ -297,10 +296,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) if (cpumask_intersects(desc->irq_data.affinity, 
cpu_online_mask)) set = desc->irq_data.affinity; - else { - irq_compat_clr_affinity(desc); + else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); - } } cpumask_and(mask, cpu_online_mask, set); @@ -587,8 +584,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, irqd_set(&desc->irq_data, IRQD_LEVEL); } - if (chip != desc->irq_data.chip) - irq_chip_set_defaults(desc->irq_data.chip); ret = 0; break; default: @@ -785,7 +780,6 @@ static int irq_thread(void *data) * but AFAICT IRQS_PENDING should be fine as it * retriggers the interrupt itself --- tglx */ - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { @@ -981,8 +975,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->thread_mask = 1 << ffz(thread_mask); if (!shared) { - irq_chip_set_defaults(desc->irq_data.chip); - init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index ad683a99b1e..14dd5761e8c 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -65,7 +65,6 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if (desc->istate & IRQS_REPLAY) return; if (desc->istate & IRQS_PENDING) { - irq_compat_clr_pending(desc); desc->istate &= ~IRQS_PENDING; desc->istate |= IRQS_REPLAY; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 0227ad35827..0d91730b633 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -15,17 +15,8 @@ enum { _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; -#define IRQ_INPROGRESS GOT_YOU_MORON -#define IRQ_REPLAY GOT_YOU_MORON -#define IRQ_WAITING GOT_YOU_MORON -#define IRQ_DISABLED GOT_YOU_MORON -#define IRQ_PENDING GOT_YOU_MORON -#define IRQ_MASKED GOT_YOU_MORON -#define IRQ_WAKEUP GOT_YOU_MORON -#define IRQ_MOVE_PENDING GOT_YOU_MORON #define IRQ_PER_CPU GOT_YOU_MORON #define IRQ_NO_BALANCING GOT_YOU_MORON -#define IRQ_AFFINITY_SET GOT_YOU_MORON #define IRQ_LEVEL GOT_YOU_MORON #define IRQ_NOPROBE GOT_YOU_MORON #define IRQ_NOREQUEST GOT_YOU_MORON @@ -37,102 +28,98 @@ enum { static inline void irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) { - desc->status &= ~(clr & _IRQF_MODIFY_MASK); - desc->status |= (set & _IRQF_MODIFY_MASK); + desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK); + desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); } static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) { - return desc->status & _IRQ_PER_CPU; + return desc->status_use_accessors & _IRQ_PER_CPU; } static inline void irq_settings_set_per_cpu(struct irq_desc *desc) { - desc->status |= _IRQ_PER_CPU; + desc->status_use_accessors |= _IRQ_PER_CPU; } static inline void irq_settings_set_no_balancing(struct irq_desc *desc) { - desc->status |= _IRQ_NO_BALANCING; + desc->status_use_accessors |= _IRQ_NO_BALANCING; } static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) { - return desc->status & _IRQ_NO_BALANCING; + return desc->status_use_accessors & _IRQ_NO_BALANCING; } static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) { - return desc->status & IRQ_TYPE_SENSE_MASK; + return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK; } static inline void irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) { - desc->status &= ~IRQ_TYPE_SENSE_MASK; - desc->status |= mask & IRQ_TYPE_SENSE_MASK; + desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK; + desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK; } static inline 
bool irq_settings_is_level(struct irq_desc *desc) { - return desc->status & _IRQ_LEVEL; + return desc->status_use_accessors & _IRQ_LEVEL; } static inline void irq_settings_clr_level(struct irq_desc *desc) { - desc->status &= ~_IRQ_LEVEL; + desc->status_use_accessors &= ~_IRQ_LEVEL; } static inline void irq_settings_set_level(struct irq_desc *desc) { - desc->status |= _IRQ_LEVEL; + desc->status_use_accessors |= _IRQ_LEVEL; } static inline bool irq_settings_can_request(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOREQUEST); + return !(desc->status_use_accessors & _IRQ_NOREQUEST); } static inline void irq_settings_clr_norequest(struct irq_desc *desc) { - desc->status &= ~_IRQ_NOREQUEST; + desc->status_use_accessors &= ~_IRQ_NOREQUEST; } static inline void irq_settings_set_norequest(struct irq_desc *desc) { - desc->status |= _IRQ_NOREQUEST; + desc->status_use_accessors |= _IRQ_NOREQUEST; } static inline bool irq_settings_can_probe(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOPROBE); + return !(desc->status_use_accessors & _IRQ_NOPROBE); } static inline void irq_settings_clr_noprobe(struct irq_desc *desc) { - desc->status &= ~_IRQ_NOPROBE; + desc->status_use_accessors &= ~_IRQ_NOPROBE; } static inline void irq_settings_set_noprobe(struct irq_desc *desc) { - desc->status |= _IRQ_NOPROBE; + desc->status_use_accessors |= _IRQ_NOPROBE; } static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) { - return desc->status & _IRQ_MOVE_PCNTXT; + return desc->status_use_accessors & _IRQ_MOVE_PCNTXT; } static inline bool irq_settings_can_autoenable(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOAUTOEN); + return !(desc->status_use_accessors & _IRQ_NOAUTOEN); } static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) { - return desc->status & _IRQ_NESTED_THREAD; + return desc->status_use_accessors & _IRQ_NESTED_THREAD; } - -/* Nothing should touch desc->status from now on */ -#undef status -#define status USE_THE_PROPER_WRAPPERS_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 83f4799f46b..dfbd550401b 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -93,7 +93,6 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too */ - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; goto out; } -- cgit v1.2.3-70-g09d2 From 851d7cf647e0d31668eb5dc496f7698a2f6136b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 Mar 2011 02:51:13 +0200 Subject: genirq: Remove move_*irq leftovers All users converted to new interface. 
Signed-off-by: Thomas Gleixner
---
 include/linux/irq.h    |  4 ----
 kernel/irq/migration.c | 10 ----------
 2 files changed, 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 7b3faace437..2a375a72ce3 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -386,13 +386,9 @@ extern int __irq_set_affinity_locked(struct irq_data *data, const struct cpumas
 #ifdef CONFIG_GENERIC_HARDIRQS
 
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
-void move_native_irq(int irq);
-void move_masked_irq(int irq);
 void irq_move_irq(struct irq_data *data);
 void irq_move_masked_irq(struct irq_data *data);
 #else
-static inline void move_native_irq(int irq) { }
-static inline void move_masked_irq(int irq) { }
 static inline void irq_move_irq(struct irq_data *data) { }
 static inline void irq_move_masked_irq(struct irq_data *data) { }
 #endif
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index e33d9c8d508..bc6194698df 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -53,11 +53,6 @@ void irq_move_masked_irq(struct irq_data *idata)
 	cpumask_clear(desc->pending_mask);
 }
 
-void move_masked_irq(int irq)
-{
-	irq_move_masked_irq(irq_get_irq_data(irq));
-}
-
 void irq_move_irq(struct irq_data *idata)
 {
 	bool masked;
@@ -80,8 +75,3 @@ void irq_move_irq(struct irq_data *idata)
 	if (!masked)
 		idata->chip->irq_unmask(idata);
 }
-
-void move_native_irq(int irq)
-{
-	irq_move_irq(irq_get_irq_data(irq));
-}
--
cgit v1.2.3-70-g09d2
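
For illustration only, and not part of any patch above: with GENERIC_IRQ_SHOW selected, the generic show_interrupts() emits the per-IRQ rows and an architecture merely overrides the __weak arch_show_interrupts() hook to append its own summary rows, much as alpha does for its IPI/PMI/ERR lines. If the architecture defines ACTUAL_NR_IRQS (as alpha does), the generic loop now ends there instead of at nr_irqs. A minimal sketch, assuming a hypothetical per-cpu ipi_count statistic:

/*
 * Illustrative sketch only -- not taken from the patches above.
 * "ipi_count" is a made-up per-cpu statistic.
 */
#include <linux/cpumask.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/seq_file.h>

static DEFINE_PER_CPU(unsigned long, ipi_count);

/* Overrides the __weak default in kernel/irq/proc.c */
int arch_show_interrupts(struct seq_file *p, int prec)
{
	int j;

	/* One extra row, formatted like the generic per-IRQ rows */
	seq_printf(p, "%*s: ", prec, "IPI");
	for_each_online_cpu(j)
		seq_printf(p, "%10lu ", per_cpu(ipi_count, j));
	seq_puts(p, "  Inter-processor interrupts\n");

	return 0;
}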
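
Also for illustration, not part of any patch above: the interface that remains once the compat layer, the deprecated irq-number based irq_chip callbacks and the move_*irq() wrappers are gone. Chip callbacks take a struct irq_data, setup goes through the irq_set_*() helpers rather than the removed set_irq_*() wrappers, and drivers no longer poke desc->status directly. A rough sketch, not a real driver; the foo_* names, register offsets and FOO_IRQ_BASE are invented:

/*
 * Illustrative sketch only -- not taken from the patches above.
 * The foo_* names, the register offsets and FOO_IRQ_BASE are invented.
 */
#include <linux/init.h>
#include <linux/io.h>
#include <linux/irq.h>

#define FOO_IRQ_BASE	32
#define FOO_MASK_SET	0x10
#define FOO_MASK_CLR	0x14
#define FOO_ACK		0x18

struct foo_priv {
	void __iomem *base;
};

static void foo_irq_mask(struct irq_data *d)
{
	/* chip_data was attached with irq_set_chip_data() below */
	struct foo_priv *priv = d->chip_data;

	writel(1 << (d->irq - FOO_IRQ_BASE), priv->base + FOO_MASK_SET);
}

static void foo_irq_unmask(struct irq_data *d)
{
	struct foo_priv *priv = d->chip_data;

	writel(1 << (d->irq - FOO_IRQ_BASE), priv->base + FOO_MASK_CLR);
}

static void foo_irq_ack(struct irq_data *d)
{
	struct foo_priv *priv = d->chip_data;

	writel(1 << (d->irq - FOO_IRQ_BASE), priv->base + FOO_ACK);
}

/* Only the irq_data based callbacks exist after the compat removal */
static struct irq_chip foo_irq_chip = {
	.name		= "FOO",
	.irq_ack	= foo_irq_ack,
	.irq_mask	= foo_irq_mask,
	.irq_unmask	= foo_irq_unmask,
};

static void __init foo_irq_setup(struct foo_priv *priv, unsigned int irq)
{
	/* New-style setters replace the removed set_irq_*() wrappers */
	irq_set_chip_data(irq, priv);
	irq_set_chip_and_handler(irq, &foo_irq_chip, handle_level_irq);
}

In the same spirit, an architecture that used to call move_native_irq(irq) from its ack/eoi path now calls irq_move_irq() on the irq_data it already has in hand.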