From 6d1d8050b4bc89d0165d29b58e894aeba2564a97 Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Tue, 31 Aug 2010 15:47:05 -0500 Subject: block, partition: add partition_meta_info to hd_struct I'm reposting this patch series as v4 since there have been no additional comments, and I cleaned up one extra bit of unneeded code (in 3/3). The patches are against Linus's tree: 2bfc96a127bc1cc94d26bfaa40159966064f9c8c (2.6.36-rc3). Would this patchset be suitable for inclusion in an mm branch? This changes adds a partition_meta_info struct which itself contains a union of structures that provide partition table specific metadata. This change leaves the union empty. The subsequent patch includes an implementation for CONFIG_EFI_PARTITION-based metadata. Signed-off-by: Will Drewry Signed-off-by: Jens Axboe --- fs/partitions/check.c | 23 ++++++++++++++++++++--- fs/partitions/check.h | 3 +++ 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 79fbf3f390f..6dfbee03ccc 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -352,6 +352,7 @@ static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); free_part_stats(p); + free_part_info(p); kfree(p); } @@ -401,7 +402,8 @@ static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, whole_disk_show, NULL); struct hd_struct *add_partition(struct gendisk *disk, int partno, - sector_t start, sector_t len, int flags) + sector_t start, sector_t len, int flags, + struct partition_meta_info *info) { struct hd_struct *p; dev_t devt = MKDEV(0, 0); @@ -438,6 +440,14 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, p->partno = partno; p->policy = get_disk_ro(disk); + if (info) { + struct partition_meta_info *pinfo = alloc_part_info(disk); + if (!pinfo) + goto out_free_stats; + memcpy(pinfo, info, sizeof(*info)); + p->info = pinfo; + } + dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); @@ -451,7 +461,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, err = blk_alloc_devt(p, &devt); if (err) - goto out_free_stats; + goto out_free_info; pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ @@ -481,6 +491,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, return p; +out_free_info: + free_part_info(p); out_free_stats: free_part_stats(p); out_free: @@ -642,6 +654,7 @@ rescan: /* add partitions */ for (p = 1; p < state->limit; p++) { sector_t size, from; + struct partition_meta_info *info = NULL; size = state->parts[p].size; if (!size) @@ -675,8 +688,12 @@ rescan: size = get_capacity(disk) - from; } } + + if (state->parts[p].has_info) + info = &state->parts[p].info; part = add_partition(disk, p, from, size, - state->parts[p].flags); + state->parts[p].flags, + &state->parts[p].info); if (IS_ERR(part)) { printk(KERN_ERR " %s: p%d could not be added: %ld\n", disk->disk_name, p, -PTR_ERR(part)); diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 8e4e103ba21..d68bf4dc3bc 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h @@ -1,5 +1,6 @@ #include #include +#include /* * add_gd_partition adds a partitions details to the devices partition @@ -12,6 +13,8 @@ struct parsed_partitions { sector_t from; sector_t size; int flags; + bool has_info; + struct partition_meta_info info; } parts[DISK_MAX_PARTS]; int next; int limit; -- cgit v1.2.3-70-g09d2 From eec7ecfede74bb996060efefd5c157acd5794e8a Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Tue, 31 Aug 2010 15:47:06 -0500 Subject: genhd, efi: add efi partition metadata to hd_structs This change extends the partition_meta_info structure to support EFI GPT-specific metadata and ensures that data is copied in on partition scanning. Signed-off-by: Will Drewry Signed-off-by: Jens Axboe --- fs/partitions/efi.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'fs') diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index dbb44d4bb8a..ac0ccb5026a 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -94,6 +94,7 @@ * ************************************************************/ #include +#include #include #include #include "check.h" @@ -604,6 +605,7 @@ int efi_partition(struct parsed_partitions *state) gpt_entry *ptes = NULL; u32 i; unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + u8 unparsed_guid[37]; if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); @@ -614,6 +616,9 @@ int efi_partition(struct parsed_partitions *state) pr_debug("GUID Partition Table is valid! Yea!\n"); for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { + struct partition_meta_info *info; + unsigned label_count = 0; + unsigned label_max; u64 start = le64_to_cpu(ptes[i].starting_lba); u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; @@ -627,6 +632,26 @@ int efi_partition(struct parsed_partitions *state) if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) state->parts[i + 1].flags = ADDPART_FLAG_RAID; + + info = &state->parts[i + 1].info; + /* Instead of doing a manual swap to big endian, reuse the + * common ASCII hex format as the interim. + */ + efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid); + part_pack_uuid(unparsed_guid, info->uuid); + + /* Naively convert UTF16-LE to 7 bits. */ + label_max = min(sizeof(info->volname) - 1, + sizeof(ptes[i].partition_name)); + info->volname[label_max] = 0; + while (label_count < label_max) { + u8 c = ptes[i].partition_name[label_count] & 0xff; + if (c && !isprint(c)) + c = '!'; + info->volname[label_count] = c; + label_count++; + } + state->parts[i + 1].has_info = true; } kfree(ptes); kfree(gpt); -- cgit v1.2.3-70-g09d2 From 749ef9f8423054e326f3a246327ed2db4b6d395f Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 20 Sep 2010 15:24:50 +0200 Subject: cfq: improve fsync performance for small files Fsync performance for small files achieved by cfq on high-end disks is lower than what deadline can achieve, due to idling introduced between the sync write happening in process context and the journal commit. Moreover, when competing with a sequential reader, a process writing small files and fsync-ing them is starved. This patch fixes the two problems by: - marking journal commits as WRITE_SYNC, so that they get the REQ_NOIDLE flag set, - force all queues that have REQ_NOIDLE requests to be put in the noidle tree. Having the queue associated to the fsync-ing process and the one associated to journal commits in the noidle tree allows: - switching between them without idling, - fairness vs. competing idling queues, since they will be serviced only after the noidle tree expires its slice. Acked-by: Vivek Goyal Reviewed-by: Jeff Moyer Tested-by: Jeff Moyer Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 18 ++++-------------- fs/jbd/commit.c | 2 +- fs/jbd2/commit.c | 2 +- 3 files changed, 6 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b9f86190763..68459262173 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -216,7 +216,6 @@ struct cfq_data { enum wl_type_t serving_type; unsigned long workload_expires; struct cfq_group *serving_group; - bool noidle_tree_requires_idle; /* * Each priority tree is sorted by next_request position. These @@ -2126,7 +2125,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) slice = max_t(unsigned, slice, CFQ_MIN_TT); cfq_log(cfqd, "workload slice:%d", slice); cfqd->workload_expires = jiffies + slice; - cfqd->noidle_tree_requires_idle = false; } static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) @@ -3108,7 +3106,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq->queued[0] + cfqq->queued[1] >= 4) cfq_mark_cfqq_deep(cfqq); - if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || + if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) + enable_idle = 0; + else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { @@ -3421,17 +3421,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_slice_expired(cfqd, 1); else if (sync && cfqq_empty && !cfq_close_cooperator(cfqd, cfqq)) { - cfqd->noidle_tree_requires_idle |= - !(rq->cmd_flags & REQ_NOIDLE); - /* - * Idling is enabled for SYNC_WORKLOAD. - * SYNC_NOIDLE_WORKLOAD idles at the end of the tree - * only if we processed at least one !REQ_NOIDLE request - */ - if (cfqd->serving_type == SYNC_WORKLOAD - || cfqd->noidle_tree_requires_idle - || cfqq->cfqg->nr_cfqq == 1) - cfq_arm_slice_timer(cfqd); + cfq_arm_slice_timer(cfqd); } } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 95d8c11c929..3f030e9efea 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -318,7 +318,7 @@ void journal_commit_transaction(journal_t *journal) int first_tag = 0; int tag_flag; int i; - int write_op = WRITE; + int write_op = WRITE_SYNC; /* * First job: lock down the current transaction and wait for diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7c068c189d8..80910f51d4b 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -360,7 +360,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; - int write_op = WRITE; + int write_op = WRITE_SYNC; /* * First job: lock down the current transaction and wait for -- cgit v1.2.3-70-g09d2 From 7681bfeeccff5efa9eb29bf09249a3c400b15327 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 19 Oct 2010 09:05:00 +0200 Subject: block: fix accounting bug on cross partition merges /proc/diskstats would display a strange output as follows. $ cat /proc/diskstats |grep sda 8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089 8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691 ~~~~~~~~~~ 8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390 8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92 8 4 sda4 4 0 8 0 0 0 0 0 0 0 0 8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137 Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE. The detailed root cause is as follows. Assuming that there are two partition, sda1 and sda2. 1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight is 0 and sda2's one is 1. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 2. A bio belongs to sda1 is issued and is merged into the request mentioned on step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed from sda2 region to sda1 region. However the two partition's hd_struct->in_flight are not changed. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 3. The request is finished and blk_account_io_done() is called. In this case, sda2's hd_struct->in_flight, not a sda1's one, is decremented. | hd_struct->in_flight --------------------------- sda1 | -1 sda2 | 1 --------------------------- The patch fixes the problem by caching the partition lookup inside the request structure, hence making sure that the increment and decrement will always happen on the same partition struct. This also speeds up IO with accounting enabled, since it cuts down on the number of lookups we have to do. When reloading partition tables, quiesce IO to ensure that no request references to the partition struct exists. When it is safe to free the partition table, the IO for that device is restarted again. Signed-off-by: Yasuaki Ishimatsu Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 24 ++++++++++++++++-------- block/blk-merge.c | 2 +- block/blk.h | 4 ---- block/genhd.c | 14 ++++++++++++++ fs/partitions/check.c | 12 ++++++++++++ include/linux/blkdev.h | 1 + include/linux/elevator.h | 2 ++ include/linux/genhd.h | 1 + 8 files changed, 47 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/block/blk-core.c b/block/blk-core.c index 797d5095eb8..ddc68332d65 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,13 +64,15 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) + if (!new_io) { + part = rq->part; part_stat_inc(cpu, part, merges[rw]); - else { + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); part_round_stats(cpu, part); part_inc_in_flight(part, rw); + rq->part = part; } part_stat_unlock(); @@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); + rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -804,11 +807,16 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl->starved[is_sync] = 0; priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) + if (priv) { rl->elvpriv++; - if (blk_queue_io_stat(q)) - rw_flags |= REQ_IO_STAT; + /* + * Don't do stats for non-priv requests + */ + if (blk_queue_io_stat(q)) + rw_flags |= REQ_IO_STAT; + } + spin_unlock_irq(q->queue_lock); rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); @@ -1777,7 +1785,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1797,7 +1805,7 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); diff --git a/block/blk-merge.c b/block/blk-merge.c index 6a725461654..38ff234012a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); diff --git a/block/blk.h b/block/blk.h index 6738831ba44..1340cce5721 100644 --- a/block/blk.h +++ b/block/blk.h @@ -110,10 +110,6 @@ void blk_queue_congestion_threshold(struct request_queue *q); int blk_dev_init(void); -void elv_quiesce_start(struct request_queue *q); -void elv_quiesce_end(struct request_queue *q); - - /* * Return the threshold (number of used requests) at which the queue is * considered to be congested. It include a little hysteresis to keep the diff --git a/block/genhd.c b/block/genhd.c index 7923e720ddf..8313834596d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -932,8 +932,15 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head) { struct disk_part_tbl *ptbl = container_of(head, struct disk_part_tbl, rcu_head); + struct gendisk *disk = ptbl->disk; + struct request_queue *q = disk->queue; + unsigned long flags; kfree(ptbl); + + spin_lock_irqsave(q->queue_lock, flags); + elv_quiesce_end(q); + spin_unlock_irqrestore(q->queue_lock, flags); } /** @@ -951,11 +958,17 @@ static void disk_replace_part_tbl(struct gendisk *disk, struct disk_part_tbl *new_ptbl) { struct disk_part_tbl *old_ptbl = disk->part_tbl; + struct request_queue *q = disk->queue; rcu_assign_pointer(disk->part_tbl, new_ptbl); if (old_ptbl) { rcu_assign_pointer(old_ptbl->last_lookup, NULL); + + spin_lock_irq(q->queue_lock); + elv_quiesce_start(q); + spin_unlock_irq(q->queue_lock); + call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); } } @@ -996,6 +1009,7 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) return -ENOMEM; new_ptbl->len = target; + new_ptbl->disk = disk; for (i = 0; i < len; i++) rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6dfbee03ccc..30f46c2cb9d 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -365,17 +365,25 @@ struct device_type part_type = { static void delete_partition_rcu_cb(struct rcu_head *head) { struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); + struct gendisk *disk = part_to_disk(part); + struct request_queue *q = disk->queue; + unsigned long flags; part->start_sect = 0; part->nr_sects = 0; part_stat_set_all(part, 0); put_device(part_to_dev(part)); + + spin_lock_irqsave(q->queue_lock, flags); + elv_quiesce_end(q); + spin_unlock_irqrestore(q->queue_lock, flags); } void delete_partition(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = disk->part_tbl; struct hd_struct *part; + struct request_queue *q = disk->queue; if (partno >= ptbl->len) return; @@ -390,6 +398,10 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); + spin_lock_irq(q->queue_lock); + elv_quiesce_start(q); + spin_unlock_irq(q->queue_lock); + call_rcu(&part->rcu_head, delete_partition_rcu_cb); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8f3dd981b97..16f7f1be1ac 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -115,6 +115,7 @@ struct request { void *elevator_private3; struct gendisk *rq_disk; + struct hd_struct *part; unsigned long start_time; #ifdef CONFIG_BLK_CGROUP unsigned long long start_time_ns; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 2c958f4fce1..df1ee866d71 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -121,6 +121,8 @@ extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *, struct request *, gfp_t); extern void elv_put_request(struct request_queue *, struct request *); extern void elv_drain_elevator(struct request_queue *); +extern void elv_quiesce_start(struct request_queue *); +extern void elv_quiesce_end(struct request_queue *); /* * io scheduler registration diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 66e26b5a153..57647ecfc1b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -140,6 +140,7 @@ struct disk_part_tbl { struct rcu_head rcu_head; int len; struct hd_struct *last_lookup; + struct gendisk *disk; struct hd_struct *part[]; }; -- cgit v1.2.3-70-g09d2