From 97e36834f5a106459ab1b290e663a4eb6264639e Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 12 Oct 2011 12:12:36 -0400 Subject: xen/blk[front|back]: Squash blkif_request_rw and blkif_request_discard together In a union type structure to deal with the overlapping attributes in a easier manner. Suggested-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/interface/io/blkif.h | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'include/xen/interface') diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index 9324488f23f..f88e28b6a27 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -95,6 +95,12 @@ typedef uint64_t blkif_sector_t; #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 struct blkif_request_rw { + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ +#ifdef CONFIG_X86_64 + uint32_t _pad1; /* offsetof(blkif_request,u.rw.id) == 8 */ +#endif + uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ struct blkif_request_segment { grant_ref_t gref; /* reference to I/O buffer frame */ @@ -102,23 +108,27 @@ struct blkif_request_rw { /* @last_sect: last sector in frame to transfer (inclusive). */ uint8_t first_sect, last_sect; } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; -}; +} __attribute__((__packed__)); struct blkif_request_discard { + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t _pad1; /* only for read/write requests */ +#ifdef CONFIG_X86_64 + uint32_t _pad2; /* offsetof(blkif_req..,u.discard.id)==8*/ +#endif + uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number; - uint64_t nr_sectors; -}; + uint64_t nr_sectors; + uint8_t _pad3; +} __attribute__((__packed__)); struct blkif_request { uint8_t operation; /* BLKIF_OP_??? */ - uint8_t nr_segments; /* number of segments */ - blkif_vdev_t handle; /* only for read/write requests */ - uint64_t id; /* private guest value, echoed in resp */ union { struct blkif_request_rw rw; struct blkif_request_discard discard; } u; -}; +} __attribute__((__packed__)); struct blkif_response { uint64_t id; /* copied from request */ -- cgit v1.2.3-70-g09d2 From 5ea42986694a96542644f9cae8b122d3a00c508f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 12 Oct 2011 16:23:30 -0400 Subject: xen/blk[front|back]: Enhance discard support with secure erasing support. Part of the blkdev_issue_discard(xx) operation is that it can also issue a secure discard operation that will permanantly remove the sectors in question. We advertise that we can support that via the 'discard-secure' attribute and on the request, if the 'secure' bit is set, we will attempt to pass in REQ_DISCARD | REQ_SECURE. CC: Li Dongyang [v1: Used 'flag' instead of 'secure:1' bit] [v2: Use 'reserved' uint8_t instead of adding a new value] [v3: Check for nseg when mapping instead of operation] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 18 ++++++++++------ drivers/block/xen-blkback/common.h | 7 +++++-- drivers/block/xen-blkback/xenbus.c | 12 +++++++++++ drivers/block/xen-blkfront.c | 41 ++++++++++++++++++++++++++++--------- include/xen/interface/io/blkif.h | 18 +++++++++++++++- 5 files changed, 77 insertions(+), 19 deletions(-) (limited to 'include/xen/interface') diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index d7104abc8b7..9d2261b02f2 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -422,13 +422,16 @@ static void xen_blk_discard(struct xen_blkif *blkif, struct blkif_request *req) int status = BLKIF_RSP_OKAY; struct block_device *bdev = blkif->vbd.bdev; - if (blkif->blk_backend_type == BLKIF_BACKEND_PHY) + if (blkif->blk_backend_type == BLKIF_BACKEND_PHY) { + unsigned long secure = (blkif->vbd.discard_secure && + (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? + BLKDEV_DISCARD_SECURE : 0; /* just forward the discard request */ err = blkdev_issue_discard(bdev, req->u.discard.sector_number, req->u.discard.nr_sectors, - GFP_KERNEL, 0); - else if (blkif->blk_backend_type == BLKIF_BACKEND_FILE) { + GFP_KERNEL, secure); + } else if (blkif->blk_backend_type == BLKIF_BACKEND_FILE) { /* punch a hole in the backing file */ struct loop_device *lo = bdev->bd_disk->private_data; struct file *file = lo->lo_backing_file; @@ -643,8 +646,11 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, break; } - /* Check that the number of segments is sane. */ - nseg = req->u.rw.nr_segments; + if (unlikely(operation == REQ_DISCARD)) + nseg = 0; + else + /* Check that the number of segments is sane. */ + nseg = req->u.rw.nr_segments; if (unlikely(nseg == 0 && operation != WRITE_FLUSH && operation != REQ_DISCARD) || @@ -708,7 +714,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * the hypercall to unmap the grants - that is all done in * xen_blkbk_unmap. */ - if (operation != REQ_DISCARD && xen_blkbk_map(req, pending_req, seg)) + if (nseg && xen_blkbk_map(req, pending_req, seg)) goto fail_flush; /* diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index dbfe7b3b073..d0ee7edc9be 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -69,7 +69,7 @@ struct blkif_x86_32_request_rw { } __attribute__((__packed__)); struct blkif_x86_32_request_discard { - uint8_t nr_segments; /* number of segments */ + uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */ blkif_vdev_t _pad1; /* was "handle" for read/write requests */ uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ @@ -104,7 +104,7 @@ struct blkif_x86_64_request_rw { } __attribute__((__packed__)); struct blkif_x86_64_request_discard { - uint8_t nr_segments; /* number of segments */ + uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */ blkif_vdev_t _pad1; /* was "handle" for read/write requests */ uint32_t _pad2; /* offsetof(blkif_..,u.discard.id)==8 */ uint64_t id; @@ -164,6 +164,7 @@ struct xen_vbd { /* Cached size parameter. */ sector_t size; bool flush_support; + bool discard_secure; }; struct backend_info; @@ -261,6 +262,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, dst->u.rw.seg[i] = src->u.rw.seg[i]; break; case BLKIF_OP_DISCARD: + dst->u.discard.flag = src->u.discard.flag; dst->u.discard.sector_number = src->u.discard.sector_number; dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; @@ -290,6 +292,7 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, dst->u.rw.seg[i] = src->u.rw.seg[i]; break; case BLKIF_OP_DISCARD: + dst->u.discard.flag = src->u.discard.flag; dst->u.discard.sector_number = src->u.discard.sector_number; dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index f759ad4584c..187fd2c1a15 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -338,6 +338,9 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, if (q && q->flush_flags) vbd->flush_support = true; + if (q && blk_queue_secdiscard(q)) + vbd->discard_secure = true; + DPRINTK("Successful creation of handle=%04x (dom=%u)\n", handle, blkif->domid); return 0; @@ -420,6 +423,15 @@ int xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be) state = 1; blkif->blk_backend_type = BLKIF_BACKEND_PHY; } + /* Optional. */ + err = xenbus_printf(xbt, dev->nodename, + "discard-secure", "%d", + blkif->vbd.discard_secure); + if (err) { + xenbus_dev_fatal(dev, err, + "writting discard-secure"); + goto kfree; + } } } else { err = PTR_ERR(type); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2c2c4be7d9d..351ddeffd43 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -98,7 +98,8 @@ struct blkfront_info unsigned long shadow_free; unsigned int feature_flush; unsigned int flush_op; - unsigned int feature_discard; + unsigned int feature_discard:1; + unsigned int feature_secdiscard:1; unsigned int discard_granularity; unsigned int discard_alignment; int is_ready; @@ -305,11 +306,14 @@ static int blkif_queue_request(struct request *req) ring_req->operation = info->flush_op; } - if (unlikely(req->cmd_flags & REQ_DISCARD)) { + if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { /* id, sector_number and handle are set above. */ ring_req->operation = BLKIF_OP_DISCARD; - ring_req->u.discard.nr_segments = 0; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); + if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) + ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; + else + ring_req->u.discard.flag = 0; } else { ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, info->sg); @@ -426,6 +430,8 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) blk_queue_max_discard_sectors(rq, get_capacity(gd)); rq->limits.discard_granularity = info->discard_granularity; rq->limits.discard_alignment = info->discard_alignment; + if (info->feature_secdiscard) + queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq); } /* Hard sector size and max sectors impersonate the equiv. hardware. */ @@ -707,6 +713,8 @@ static void blkif_free(struct blkfront_info *info, int suspend) static void blkif_completion(struct blk_shadow *s) { int i; + /* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place + * flag. */ for (i = 0; i < s->req.u.rw.nr_segments; i++) gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL); } @@ -738,7 +746,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) id = bret->id; req = info->shadow[id].request; - blkif_completion(&info->shadow[id]); + if (bret->operation != BLKIF_OP_DISCARD) + blkif_completion(&info->shadow[id]); add_id_to_freelist(info, id); @@ -751,7 +760,9 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) info->gd->disk_name); error = -EOPNOTSUPP; info->feature_discard = 0; + info->feature_secdiscard = 0; queue_flag_clear(QUEUE_FLAG_DISCARD, rq); + queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); } __blk_end_request_all(req, error); break; @@ -1039,13 +1050,15 @@ static int blkif_recover(struct blkfront_info *info) req->u.rw.id = get_id_from_freelist(info); memcpy(&info->shadow[req->u.rw.id], ©[i], sizeof(copy[i])); + if (req->operation != BLKIF_OP_DISCARD) { /* Rewrite any grant references invalidated by susp/resume. */ - for (j = 0; j < req->u.rw.nr_segments; j++) - gnttab_grant_foreign_access_ref( - req->u.rw.seg[j].gref, - info->xbdev->otherend_id, - pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), - rq_data_dir(info->shadow[req->u.rw.id].request)); + for (j = 0; j < req->u.rw.nr_segments; j++) + gnttab_grant_foreign_access_ref( + req->u.rw.seg[j].gref, + info->xbdev->otherend_id, + pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), + rq_data_dir(info->shadow[req->u.rw.id].request)); + } info->shadow[req->u.rw.id].req = *req; info->ring.req_prod_pvt++; @@ -1137,11 +1150,13 @@ static void blkfront_setup_discard(struct blkfront_info *info) char *type; unsigned int discard_granularity; unsigned int discard_alignment; + unsigned int discard_secure; type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL); if (IS_ERR(type)) return; + info->feature_secdiscard = 0; if (strncmp(type, "phy", 3) == 0) { err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "discard-granularity", "%u", &discard_granularity, @@ -1152,6 +1167,12 @@ static void blkfront_setup_discard(struct blkfront_info *info) info->discard_granularity = discard_granularity; info->discard_alignment = discard_alignment; } + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "discard-secure", "%d", &discard_secure, + NULL); + if (!err) + info->feature_secdiscard = discard_secure; + } else if (strncmp(type, "file", 4) == 0) info->feature_discard = 1; diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index f88e28b6a27..ee338bfde18 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -84,6 +84,21 @@ typedef uint64_t blkif_sector_t; * e07154r6-Data_Set_Management_Proposal_for_ATA-ACS2.doc * http://www.seagate.com/staticfiles/support/disc/manuals/ * Interface%20manuals/100293068c.pdf + * The backend can optionally provide three extra XenBus attributes to + * further optimize the discard functionality: + * 'discard-aligment' - Devices that support discard functionality may + * internally allocate space in units that are bigger than the exported + * logical block size. The discard-alignment parameter indicates how many bytes + * the beginning of the partition is offset from the internal allocation unit's + * natural alignment. + * 'discard-granularity' - Devices that support discard functionality may + * internally allocate space using units that are bigger than the logical block + * size. The discard-granularity parameter indicates the size of the internal + * allocation unit in bytes if reported by the device. Otherwise the + * discard-granularity will be set to match the device's physical block size. + * 'discard-secure' - All copies of the discarded sectors (potentially created + * by garbage collection) must also be erased. To use this feature, the flag + * BLKIF_DISCARD_SECURE must be set in the blkif_request_trim. */ #define BLKIF_OP_DISCARD 5 @@ -111,7 +126,8 @@ struct blkif_request_rw { } __attribute__((__packed__)); struct blkif_request_discard { - uint8_t nr_segments; /* number of segments */ + uint8_t flag; /* BLKIF_DISCARD_SECURE or zero. */ +#define BLKIF_DISCARD_SECURE (1<<0) /* ignored if discard-secure=0 */ blkif_vdev_t _pad1; /* only for read/write requests */ #ifdef CONFIG_X86_64 uint32_t _pad2; /* offsetof(blkif_req..,u.discard.id)==8*/ -- cgit v1.2.3-70-g09d2 From 0f9f5a9588468cddeccc9146b86798492c7cd4f5 Mon Sep 17 00:00:00 2001 From: Annie Li Date: Tue, 22 Nov 2011 09:58:06 +0800 Subject: xen/granttable: Introducing grant table V2 stucture This patch introduces new structures of grant table V2, grant table V2 is an extension from V1. Grant table is shared between guest and Xen, and Xen is responsible to do corresponding work for grant operations, such as: figure out guest's grant table version, perform different actions based on different grant table version, etc. Although full-page structure of V2 is different from V1, it play the same role as V1. Acked-by: Ian Campbell Signed-off-by: Annie Li Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/grant-table.c | 7 +- drivers/xen/grant-table.c | 181 ++++++++++++++++++++++++++++-------- include/xen/grant_table.h | 4 +- include/xen/interface/grant_table.h | 167 ++++++++++++++++++++++++++++++++- include/xen/interface/xen.h | 2 + 5 files changed, 310 insertions(+), 51 deletions(-) (limited to 'include/xen/interface') diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 6bbfd7ac5e8..c6ab2e7ca3a 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -64,10 +64,10 @@ static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, - struct grant_entry **__shared) + void **__shared) { int rc; - struct grant_entry *shared = *__shared; + void *shared = *__shared; if (shared == NULL) { struct vm_struct *area = @@ -83,8 +83,7 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, return rc; } -void arch_gnttab_unmap_shared(struct grant_entry *shared, - unsigned long nr_gframes) +void arch_gnttab_unmap_shared(void *shared, unsigned long nr_gframes) { apply_to_page_range(&init_mm, (unsigned long)shared, PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index bf1c094f4eb..18355a53763 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -53,7 +53,7 @@ /* External tools reserve first few grant table entries. */ #define NR_RESERVED_ENTRIES 8 #define GNTTAB_LIST_END 0xffffffff -#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry)) +#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry_v1)) static grant_ref_t **gnttab_list; static unsigned int nr_grant_frames; @@ -64,7 +64,63 @@ static DEFINE_SPINLOCK(gnttab_list_lock); unsigned long xen_hvm_resume_frames; EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); -static struct grant_entry *shared; +static union { + struct grant_entry_v1 *v1; + void *addr; +} gnttab_shared; + +/*This is a structure of function pointers for grant table*/ +struct gnttab_ops { + /* + * Mapping a list of frames for storing grant entries. First input + * parameter is used to storing grant table address when grant table + * being setup, second parameter is the number of frames to map grant + * table. Returning GNTST_okay means success and negative value means + * failure. + */ + int (*map_frames)(unsigned long *, unsigned int); + /* + * Release a list of frames which are mapped in map_frames for grant + * entry status. + */ + void (*unmap_frames)(void); + /* + * Introducing a valid entry into the grant table, granting the frame + * of this grant entry to domain for accessing, or transfering, or + * transitively accessing. First input parameter is reference of this + * introduced grant entry, second one is domid of granted domain, third + * one is the frame to be granted, and the last one is status of the + * grant entry to be updated. + */ + void (*update_entry)(grant_ref_t, domid_t, unsigned long, unsigned); + /* + * Stop granting a grant entry to domain for accessing. First input + * parameter is reference of a grant entry whose grant access will be + * stopped, second one is not in use now. If the grant entry is + * currently mapped for reading or writing, just return failure(==0) + * directly and don't tear down the grant access. Otherwise, stop grant + * access for this entry and return success(==1). + */ + int (*end_foreign_access_ref)(grant_ref_t, int); + /* + * Stop granting a grant entry to domain for transfer. If tranfer has + * not started, just reclaim the grant entry and return failure(==0). + * Otherwise, wait for the transfer to complete and then return the + * frame. + */ + unsigned long (*end_foreign_transfer_ref)(grant_ref_t); + /* + * Query the status of a grant entry. Input parameter is reference of + * queried grant entry, return value is the status of queried entry. + * Detailed status(writing/reading) can be gotten from the return value + * by bit operations. + */ + int (*query_foreign_access)(grant_ref_t); +}; + +static struct gnttab_ops *gnttab_interface; + +static int grant_table_version; static struct gnttab_free_callback *gnttab_free_callback_list; @@ -142,23 +198,23 @@ static void put_free_entry(grant_ref_t ref) spin_unlock_irqrestore(&gnttab_list_lock, flags); } -static void update_grant_entry(grant_ref_t ref, domid_t domid, - unsigned long frame, unsigned flags) +/* + * Introducing a valid entry into the grant table: + * 1. Write ent->domid. + * 2. Write ent->frame: + * GTF_permit_access: Frame to which access is permitted. + * GTF_accept_transfer: Pseudo-phys frame slot being filled by new + * frame, or zero if none. + * 3. Write memory barrier (WMB). + * 4. Write ent->flags, inc. valid type. + */ +static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid, + unsigned long frame, unsigned flags) { - /* - * Introducing a valid entry into the grant table: - * 1. Write ent->domid. - * 2. Write ent->frame: - * GTF_permit_access: Frame to which access is permitted. - * GTF_accept_transfer: Pseudo-phys frame slot being filled by new - * frame, or zero if none. - * 3. Write memory barrier (WMB). - * 4. Write ent->flags, inc. valid type. - */ - shared[ref].frame = frame; - shared[ref].domid = domid; + gnttab_shared.v1[ref].domid = domid; + gnttab_shared.v1[ref].frame = frame; wmb(); - shared[ref].flags = flags; + gnttab_shared.v1[ref].flags = flags; } /* @@ -167,7 +223,7 @@ static void update_grant_entry(grant_ref_t ref, domid_t domid, void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, unsigned long frame, int readonly) { - update_grant_entry(ref, domid, frame, + gnttab_interface->update_entry(ref, domid, frame, GTF_permit_access | (readonly ? GTF_readonly : 0)); } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref); @@ -187,31 +243,37 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); -int gnttab_query_foreign_access(grant_ref_t ref) +static int gnttab_query_foreign_access_v1(grant_ref_t ref) { - u16 nflags; - - nflags = shared[ref].flags; + return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); +} - return nflags & (GTF_reading|GTF_writing); +int gnttab_query_foreign_access(grant_ref_t ref) +{ + return gnttab_interface->query_foreign_access(ref); } EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); -int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) { u16 flags, nflags; - nflags = shared[ref].flags; + nflags = gnttab_shared.v1[ref].flags; do { flags = nflags; if (flags & (GTF_reading|GTF_writing)) { printk(KERN_ALERT "WARNING: g.e. still in use!\n"); return 0; } - } while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) != flags); + } while ((nflags = sync_cmpxchg(&gnttab_shared.v1[ref].flags, flags, 0)) != flags); return 1; } + +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +{ + return gnttab_interface->end_foreign_access_ref(ref, readonly); +} EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); void gnttab_end_foreign_access(grant_ref_t ref, int readonly, @@ -246,11 +308,11 @@ EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer); void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid, unsigned long pfn) { - update_grant_entry(ref, domid, pfn, GTF_accept_transfer); + gnttab_interface->update_entry(ref, domid, pfn, GTF_accept_transfer); } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref); -unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) +static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref) { unsigned long frame; u16 flags; @@ -259,24 +321,29 @@ unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) * If a transfer is not even yet started, try to reclaim the grant * reference and return failure (== 0). */ - while (!((flags = shared[ref].flags) & GTF_transfer_committed)) { - if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags) + while (!((flags = gnttab_shared.v1[ref].flags) & GTF_transfer_committed)) { + if (sync_cmpxchg(&gnttab_shared.v1[ref].flags, flags, 0) == flags) return 0; cpu_relax(); } /* If a transfer is in progress then wait until it is completed. */ while (!(flags & GTF_transfer_completed)) { - flags = shared[ref].flags; + flags = gnttab_shared.v1[ref].flags; cpu_relax(); } rmb(); /* Read the frame number /after/ reading completion status. */ - frame = shared[ref].frame; + frame = gnttab_shared.v1[ref].frame; BUG_ON(frame == 0); return frame; } + +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) +{ + return gnttab_interface->end_foreign_transfer_ref(ref); +} EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref); unsigned long gnttab_end_foreign_transfer(grant_ref_t ref) @@ -520,6 +587,23 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, } EXPORT_SYMBOL_GPL(gnttab_unmap_refs); +static int gnttab_map_frames_v1(unsigned long *frames, unsigned int nr_gframes) +{ + int rc; + + rc = arch_gnttab_map_shared(frames, nr_gframes, + gnttab_max_grant_frames(), + &gnttab_shared.addr); + BUG_ON(rc); + + return 0; +} + +static void gnttab_unmap_frames_v1(void) +{ + arch_gnttab_unmap_shared(gnttab_shared.addr, nr_grant_frames); +} + static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { struct gnttab_setup_table setup; @@ -567,19 +651,35 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) BUG_ON(rc || setup.status); - rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(), - &shared); - BUG_ON(rc); + rc = gnttab_interface->map_frames(frames, nr_gframes); kfree(frames); - return 0; + return rc; +} + +static struct gnttab_ops gnttab_v1_ops = { + .map_frames = gnttab_map_frames_v1, + .unmap_frames = gnttab_unmap_frames_v1, + .update_entry = gnttab_update_entry_v1, + .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, + .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, + .query_foreign_access = gnttab_query_foreign_access_v1, +}; + +static void gnttab_request_version(void) +{ + grant_table_version = 1; + gnttab_interface = &gnttab_v1_ops; + printk(KERN_INFO "Grant tables using version %d layout.\n", + grant_table_version); } int gnttab_resume(void) { unsigned int max_nr_gframes; + gnttab_request_version(); max_nr_gframes = gnttab_max_grant_frames(); if (max_nr_gframes < nr_grant_frames) return -ENOSYS; @@ -587,9 +687,10 @@ int gnttab_resume(void) if (xen_pv_domain()) return gnttab_map(0, nr_grant_frames - 1); - if (!shared) { - shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes); - if (shared == NULL) { + if (gnttab_shared.addr == NULL) { + gnttab_shared.addr = ioremap(xen_hvm_resume_frames, + PAGE_SIZE * max_nr_gframes); + if (gnttab_shared.addr == NULL) { printk(KERN_WARNING "Failed to ioremap gnttab share frames!"); return -ENOMEM; @@ -603,7 +704,7 @@ int gnttab_resume(void) int gnttab_suspend(void) { - arch_gnttab_unmap_shared(shared, nr_grant_frames); + gnttab_interface->unmap_frames(); return 0; } diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 11e2dfce42f..c7a40f8d455 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -145,8 +145,8 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr, int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, - struct grant_entry **__shared); -void arch_gnttab_unmap_shared(struct grant_entry *shared, + void **__shared); +void arch_gnttab_unmap_shared(void *shared, unsigned long nr_gframes); extern unsigned long xen_hvm_resume_frames; diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h index 39e571796e3..a17d84433e6 100644 --- a/include/xen/interface/grant_table.h +++ b/include/xen/interface/grant_table.h @@ -84,13 +84,23 @@ * Use SMP-safe bit-setting instruction. */ +/* + * Reference to a grant entry in a specified domain's grant table. + */ +typedef uint32_t grant_ref_t; + /* * A grant table comprises a packed array of grant entries in one or more * page frames shared between Xen and a guest. * [XEN]: This field is written by Xen and read by the sharing guest. * [GST]: This field is written by the guest and read by Xen. */ -struct grant_entry { + +/* + * Version 1 of the grant table entry structure is maintained purely + * for backwards compatibility. New guests should use version 2. + */ +struct grant_entry_v1 { /* GTF_xxx: various type and flag information. [XEN,GST] */ uint16_t flags; /* The domain being granted foreign privileges. [GST] */ @@ -108,10 +118,13 @@ struct grant_entry { * GTF_permit_access: Allow @domid to map/access @frame. * GTF_accept_transfer: Allow @domid to transfer ownership of one page frame * to this guest. Xen writes the page number to @frame. + * GTF_transitive: Allow @domid to transitively access a subrange of + * @trans_grant in @trans_domid. No mappings are allowed. */ #define GTF_invalid (0U<<0) #define GTF_permit_access (1U<<0) #define GTF_accept_transfer (2U<<0) +#define GTF_transitive (3U<<0) #define GTF_type_mask (3U<<0) /* @@ -119,6 +132,9 @@ struct grant_entry { * GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST] * GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN] * GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN] + * GTF_sub_page: Grant access to only a subrange of the page. @domid + * will only be allowed to copy from the grant, and not + * map it. [GST] */ #define _GTF_readonly (2) #define GTF_readonly (1U<<_GTF_readonly) @@ -126,6 +142,8 @@ struct grant_entry { #define GTF_reading (1U<<_GTF_reading) #define _GTF_writing (4) #define GTF_writing (1U<<_GTF_writing) +#define _GTF_sub_page (8) +#define GTF_sub_page (1U<<_GTF_sub_page) /* * Subflags for GTF_accept_transfer: @@ -142,15 +160,81 @@ struct grant_entry { #define _GTF_transfer_completed (3) #define GTF_transfer_completed (1U<<_GTF_transfer_completed) +/* + * Version 2 grant table entries. These fulfil the same role as + * version 1 entries, but can represent more complicated operations. + * Any given domain will have either a version 1 or a version 2 table, + * and every entry in the table will be the same version. + * + * The interface by which domains use grant references does not depend + * on the grant table version in use by the other domain. + */ -/*********************************** - * GRANT TABLE QUERIES AND USES +/* + * Version 1 and version 2 grant entries share a common prefix. The + * fields of the prefix are documented as part of struct + * grant_entry_v1. */ +struct grant_entry_header { + uint16_t flags; + domid_t domid; +}; /* - * Reference to a grant entry in a specified domain's grant table. + * Version 2 of the grant entry structure, here is an union because three + * different types are suppotted: full_page, sub_page and transitive. + */ +union grant_entry_v2 { + struct grant_entry_header hdr; + + /* + * This member is used for V1-style full page grants, where either: + * + * -- hdr.type is GTF_accept_transfer, or + * -- hdr.type is GTF_permit_access and GTF_sub_page is not set. + * + * In that case, the frame field has the same semantics as the + * field of the same name in the V1 entry structure. + */ + struct { + struct grant_entry_header hdr; + uint32_t pad0; + uint64_t frame; + } full_page; + + /* + * If the grant type is GTF_grant_access and GTF_sub_page is set, + * @domid is allowed to access bytes [@page_off,@page_off+@length) + * in frame @frame. + */ + struct { + struct grant_entry_header hdr; + uint16_t page_off; + uint16_t length; + uint64_t frame; + } sub_page; + + /* + * If the grant is GTF_transitive, @domid is allowed to use the + * grant @gref in domain @trans_domid, as if it was the local + * domain. Obviously, the transitive access must be compatible + * with the original grant. + */ + struct { + struct grant_entry_header hdr; + domid_t trans_domid; + uint16_t pad0; + grant_ref_t gref; + } transitive; + + uint32_t __spacer[4]; /* Pad to a power of two */ +}; + +typedef uint16_t grant_status_t; + +/*********************************** + * GRANT TABLE QUERIES AND USES */ -typedef uint32_t grant_ref_t; /* * Handle to track a mapping created via a grant reference. @@ -321,6 +405,79 @@ struct gnttab_query_size { }; DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size); +/* + * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings + * tracked by but atomically replace the page table entry with one + * pointing to the machine address under . will be + * redirected to the null entry. + * NOTES: + * 1. The call may fail in an undefined manner if either mapping is not + * tracked by . + * 2. After executing a batch of unmaps, it is guaranteed that no stale + * mappings will remain in the device or host TLBs. + */ +#define GNTTABOP_unmap_and_replace 7 +struct gnttab_unmap_and_replace { + /* IN parameters. */ + uint64_t host_addr; + uint64_t new_addr; + grant_handle_t handle; + /* OUT parameters. */ + int16_t status; /* GNTST_* */ +}; +DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_and_replace); + +/* + * GNTTABOP_set_version: Request a particular version of the grant + * table shared table structure. This operation can only be performed + * once in any given domain. It must be performed before any grants + * are activated; otherwise, the domain will be stuck with version 1. + * The only defined versions are 1 and 2. + */ +#define GNTTABOP_set_version 8 +struct gnttab_set_version { + /* IN parameters */ + uint32_t version; +}; +DEFINE_GUEST_HANDLE_STRUCT(gnttab_set_version); + +/* + * GNTTABOP_get_status_frames: Get the list of frames used to store grant + * status for . In grant format version 2, the status is separated + * from the other shared grant fields to allow more efficient synchronization + * using barriers instead of atomic cmpexch operations. + * specify the size of vector . + * The frame addresses are returned in the . + * Only addresses are returned, even if the table is larger. + * NOTES: + * 1. may be specified as DOMID_SELF. + * 2. Only a sufficiently-privileged domain may specify != DOMID_SELF. + */ +#define GNTTABOP_get_status_frames 9 +struct gnttab_get_status_frames { + /* IN parameters. */ + uint32_t nr_frames; + domid_t dom; + /* OUT parameters. */ + int16_t status; /* GNTST_* */ + GUEST_HANDLE(uint64_t) frame_list; +}; +DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_status_frames); + +/* + * GNTTABOP_get_version: Get the grant table version which is in + * effect for domain . + */ +#define GNTTABOP_get_version 10 +struct gnttab_get_version { + /* IN parameters */ + domid_t dom; + uint16_t pad; + /* OUT parameters */ + uint32_t version; +}; +DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version); + /* * Bitfield values for update_pin_status.flags. */ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 6a6e9144934..a890804945e 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -523,6 +523,8 @@ struct tmem_op { } u; }; +DEFINE_GUEST_HANDLE(u64); + #else /* __ASSEMBLY__ */ /* In assembly code we cannot use C numeric constant suffixes. */ -- cgit v1.2.3-70-g09d2 From 9e7860cee18241633eddb36a4c34c7b61d8cecbc Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 4 Jan 2012 09:34:49 +0000 Subject: xen/xenbus: Reject replies with payload > XENSTORE_PAYLOAD_MAX. Haogang Chen found out that: There is a potential integer overflow in process_msg() that could result in cross-domain attack. body = kmalloc(msg->hdr.len + 1, GFP_NOIO | __GFP_HIGH); When a malicious guest passes 0xffffffff in msg->hdr.len, the subsequent call to xb_read() would write to a zero-length buffer. The other end of this connection is always the xenstore backend daemon so there is no guest (malicious or otherwise) which can do this. The xenstore daemon is a trusted component in the system. However this seem like a reasonable robustness improvement so we should have it. And Ian when read the API docs found that: The payload length (len field of the header) is limited to 4096 (XENSTORE_PAYLOAD_MAX) in both directions. If a client exceeds the limit, its xenstored connection will be immediately killed by xenstored, which is usually catastrophic from the client's point of view. Clients (particularly domains, which cannot just reconnect) should avoid this. so this patch checks against that instead. This also avoids a potential integer overflow pointed out by Haogang Chen. Signed-off-by: Ian Campbell Cc: Haogang Chen CC: stable@kernel.org Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xenbus/xenbus_xs.c | 6 ++++++ include/xen/interface/io/xs_wire.h | 3 +++ 2 files changed, 9 insertions(+) (limited to 'include/xen/interface') diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index b3b8f2f3ad1..6f0121e3be6 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -810,6 +810,12 @@ static int process_msg(void) goto out; } + if (msg->hdr.len > XENSTORE_PAYLOAD_MAX) { + kfree(msg); + err = -EINVAL; + goto out; + } + body = kmalloc(msg->hdr.len + 1, GFP_NOIO | __GFP_HIGH); if (body == NULL) { kfree(msg); diff --git a/include/xen/interface/io/xs_wire.h b/include/xen/interface/io/xs_wire.h index f0b6890370b..3c1877caaef 100644 --- a/include/xen/interface/io/xs_wire.h +++ b/include/xen/interface/io/xs_wire.h @@ -88,4 +88,7 @@ struct xenstore_domain_interface { XENSTORE_RING_IDX rsp_cons, rsp_prod; }; +/* Violating this is very bad. See docs/misc/xenstore.txt. */ +#define XENSTORE_PAYLOAD_MAX 4096 + #endif /* _XS_WIRE_H */ -- cgit v1.2.3-70-g09d2