diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-02-28 17:43:09 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-02-28 17:43:09 -0800 |
commit | 1cf0209c431fa7790253c532039d53b0773193aa (patch) | |
tree | 24310eaaf4c9583988d9098f6c85a4a34970b5b9 /net/ceph/osdmap.c | |
parent | de1a2262b006220dae2561a299a6ea128c46f4fe (diff) | |
parent | 83ca14fdd35821554058e5fd4fa7b118ee504a33 (diff) |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil:
"A few groups of patches here. Alex has been hard at work improving
the RBD code, layout groundwork for understanding the new formats and
doing layering. Most of the infrastructure is now in place for the
final bits that will come with the next window.
There are a few changes to the data layout. Jim Schutt's patch fixes
some non-ideal CRUSH behavior, and a set of patches from me updates
the client to speak a newer version of the protocol and implement an
improved hashing strategy across storage nodes (when the server side
supports it too).
A pair of patches from Sam Lang fix the atomicity of open+create
operations. Several patches from Yan, Zheng fix various mds/client
issues that turned up during multi-mds torture tests.
A final set of patches expose file layouts via virtual xattrs, and
allow the policies to be set on directories via xattrs as well
(avoiding the awkward ioctl interface and providing a consistent
interface for both kernel mount and ceph-fuse users)."
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (143 commits)
libceph: add support for HASHPSPOOL pool flag
libceph: update osd request/reply encoding
libceph: calculate placement based on the internal data types
ceph: update support for PGID64, PGPOOL3, OSDENC protocol features
ceph: update "ceph_features.h"
libceph: decode into cpu-native ceph_pg type
libceph: rename ceph_pg -> ceph_pg_v1
rbd: pass length, not op for osd completions
rbd: move rbd_osd_trivial_callback()
libceph: use a do..while loop in con_work()
libceph: use a flag to indicate a fault has occurred
libceph: separate non-locked fault handling
libceph: encapsulate connection backoff
libceph: eliminate sparse warnings
ceph: eliminate sparse warnings in fs code
rbd: eliminate sparse warnings
libceph: define connection flag helpers
rbd: normalize dout() calls
rbd: barriers are hard
rbd: ignore zero-length requests
...
Diffstat (limited to 'net/ceph/osdmap.c')
-rw-r--r-- | net/ceph/osdmap.c | 290 |
1 files changed, 155 insertions, 135 deletions
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index de73214b5d2..69bc4bf89e3 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -13,26 +13,18 @@ char *ceph_osdmap_state_str(char *str, int len, int state) { - int flag = 0; - if (!len) - goto done; - - *str = '\0'; - if (state) { - if (state & CEPH_OSD_EXISTS) { - snprintf(str, len, "exists"); - flag = 1; - } - if (state & CEPH_OSD_UP) { - snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""), - "up"); - flag = 1; - } - } else { + return str; + + if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP)) + snprintf(str, len, "exists, up"); + else if (state & CEPH_OSD_EXISTS) + snprintf(str, len, "exists"); + else if (state & CEPH_OSD_UP) + snprintf(str, len, "up"); + else snprintf(str, len, "doesn't exist"); - } -done: + return str; } @@ -53,13 +45,8 @@ static int calc_bits_of(unsigned int t) */ static void calc_pg_masks(struct ceph_pg_pool_info *pi) { - pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; - pi->pgp_num_mask = - (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1; - pi->lpg_num_mask = - (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1; - pi->lpgp_num_mask = - (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1; + pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1; + pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1; } /* @@ -170,6 +157,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) c->choose_local_tries = 2; c->choose_local_fallback_tries = 5; c->choose_total_tries = 19; + c->chooseleaf_descend_once = 0; ceph_decode_need(p, end, 4*sizeof(u32), bad); magic = ceph_decode_32(p); @@ -336,6 +324,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end) dout("crush decode tunable choose_total_tries = %d", c->choose_total_tries); + ceph_decode_need(p, end, sizeof(u32), done); + c->chooseleaf_descend_once = ceph_decode_32(p); + dout("crush decode tunable chooseleaf_descend_once = %d", + c->chooseleaf_descend_once); + done: dout("crush_decode success\n"); return c; @@ -354,12 +347,13 @@ bad: */ static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) { - u64 a = *(u64 *)&l; - u64 b = *(u64 *)&r; - - if (a < b) + if (l.pool < r.pool) + return -1; + if (l.pool > r.pool) + return 1; + if (l.seed < r.seed) return -1; - if (a > b) + if (l.seed > r.seed) return 1; return 0; } @@ -405,8 +399,8 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, } else if (c > 0) { n = n->rb_right; } else { - dout("__lookup_pg_mapping %llx got %p\n", - *(u64 *)&pgid, pg); + dout("__lookup_pg_mapping %lld.%x got %p\n", + pgid.pool, pgid.seed, pg); return pg; } } @@ -418,12 +412,13 @@ static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid) struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); if (pg) { - dout("__remove_pg_mapping %llx %p\n", *(u64 *)&pgid, pg); + dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed, + pg); rb_erase(&pg->node, root); kfree(pg); return 0; } - dout("__remove_pg_mapping %llx dne\n", *(u64 *)&pgid); + dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed); return -ENOENT; } @@ -452,7 +447,7 @@ static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) return 0; } -static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) +static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) { struct ceph_pg_pool_info *pi; struct rb_node *n = root->rb_node; @@ -508,24 +503,57 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) { - unsigned int n, m; + u8 ev, cv; + unsigned len, num; + void *pool_end; + + ceph_decode_need(p, end, 2 + 4, bad); + ev = ceph_decode_8(p); /* encoding version */ + cv = ceph_decode_8(p); /* compat version */ + if (ev < 5) { + pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); + return -EINVAL; + } + if (cv > 7) { + pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv); + return -EINVAL; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, bad); + pool_end = *p + len; - ceph_decode_copy(p, &pi->v, sizeof(pi->v)); - calc_pg_masks(pi); + pi->type = ceph_decode_8(p); + pi->size = ceph_decode_8(p); + pi->crush_ruleset = ceph_decode_8(p); + pi->object_hash = ceph_decode_8(p); + + pi->pg_num = ceph_decode_32(p); + pi->pgp_num = ceph_decode_32(p); + + *p += 4 + 4; /* skip lpg* */ + *p += 4; /* skip last_change */ + *p += 8 + 4; /* skip snap_seq, snap_epoch */ - /* num_snaps * snap_info_t */ - n = le32_to_cpu(pi->v.num_snaps); - while (n--) { - ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) + - sizeof(struct ceph_timespec), bad); - *p += sizeof(u64) + /* key */ - 1 + sizeof(u64) + /* u8, snapid */ - sizeof(struct ceph_timespec); - m = ceph_decode_32(p); /* snap name */ - *p += m; + /* skip snaps */ + num = ceph_decode_32(p); + while (num--) { + *p += 8; /* snapid key */ + *p += 1 + 1; /* versions */ + len = ceph_decode_32(p); + *p += len; } - *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; + /* skip removed snaps */ + num = ceph_decode_32(p); + *p += num * (8 + 8); + + *p += 8; /* skip auid */ + pi->flags = ceph_decode_64(p); + + /* ignore the rest */ + + *p = pool_end; + calc_pg_masks(pi); return 0; bad: @@ -535,14 +563,15 @@ bad: static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) { struct ceph_pg_pool_info *pi; - u32 num, len, pool; + u32 num, len; + u64 pool; ceph_decode_32_safe(p, end, num, bad); dout(" %d pool names\n", num); while (num--) { - ceph_decode_32_safe(p, end, pool, bad); + ceph_decode_64_safe(p, end, pool, bad); ceph_decode_32_safe(p, end, len, bad); - dout(" pool %d len %d\n", pool, len); + dout(" pool %llu len %d\n", pool, len); ceph_decode_need(p, end, len, bad); pi = __lookup_pg_pool(&map->pg_pools, pool); if (pi) { @@ -633,7 +662,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) struct ceph_osdmap *map; u16 version; u32 len, max, i; - u8 ev; int err = -EINVAL; void *start = *p; struct ceph_pg_pool_info *pi; @@ -646,9 +674,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) map->pg_temp = RB_ROOT; ceph_decode_16_safe(p, end, version, bad); - if (version > CEPH_OSDMAP_VERSION) { - pr_warning("got unknown v %d > %d of osdmap\n", version, - CEPH_OSDMAP_VERSION); + if (version > 6) { + pr_warning("got unknown v %d > 6 of osdmap\n", version); + goto bad; + } + if (version < 6) { + pr_warning("got old v %d < 6 of osdmap\n", version); goto bad; } @@ -660,20 +691,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_32_safe(p, end, max, bad); while (max--) { - ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); + ceph_decode_need(p, end, 8 + 2, bad); err = -ENOMEM; pi = kzalloc(sizeof(*pi), GFP_NOFS); if (!pi) goto bad; - pi->id = ceph_decode_32(p); - err = -EINVAL; - ev = ceph_decode_8(p); /* encoding version */ - if (ev > CEPH_PG_POOL_VERSION) { - pr_warning("got unknown v %d > %d of ceph_pg_pool\n", - ev, CEPH_PG_POOL_VERSION); - kfree(pi); - goto bad; - } + pi->id = ceph_decode_64(p); err = __decode_pool(p, end, pi); if (err < 0) { kfree(pi); @@ -682,12 +705,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) __insert_pg_pool(&map->pg_pools, pi); } - if (version >= 5) { - err = __decode_pool_names(p, end, map); - if (err < 0) { - dout("fail to decode pool names"); - goto bad; - } + err = __decode_pool_names(p, end, map); + if (err < 0) { + dout("fail to decode pool names"); + goto bad; } ceph_decode_32_safe(p, end, map->pool_max, bad); @@ -724,10 +745,13 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) for (i = 0; i < len; i++) { int n, j; struct ceph_pg pgid; + struct ceph_pg_v1 pgid_v1; struct ceph_pg_mapping *pg; ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); - ceph_decode_copy(p, &pgid, sizeof(pgid)); + ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1)); + pgid.pool = le32_to_cpu(pgid_v1.pool); + pgid.seed = le16_to_cpu(pgid_v1.ps); n = ceph_decode_32(p); err = -EINVAL; if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) @@ -745,7 +769,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) err = __insert_pg_mapping(pg, &map->pg_temp); if (err) goto bad; - dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len); + dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed, + len); } /* crush */ @@ -784,16 +809,17 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_fsid fsid; u32 epoch = 0; struct ceph_timespec modified; - u32 len, pool; - __s32 new_pool_max, new_flags, max; + s32 len; + u64 pool; + __s64 new_pool_max; + __s32 new_flags, max; void *start = *p; int err = -EINVAL; u16 version; ceph_decode_16_safe(p, end, version, bad); - if (version > CEPH_OSDMAP_INC_VERSION) { - pr_warning("got unknown v %d > %d of inc osdmap\n", version, - CEPH_OSDMAP_INC_VERSION); + if (version > 6) { + pr_warning("got unknown v %d > %d of inc osdmap\n", version, 6); goto bad; } @@ -803,7 +829,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); ceph_decode_copy(p, &modified, sizeof(modified)); - new_pool_max = ceph_decode_32(p); + new_pool_max = ceph_decode_64(p); new_flags = ceph_decode_32(p); /* full map? */ @@ -853,18 +879,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* new_pool */ ceph_decode_32_safe(p, end, len, bad); while (len--) { - __u8 ev; struct ceph_pg_pool_info *pi; - ceph_decode_32_safe(p, end, pool, bad); - ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); - ev = ceph_decode_8(p); /* encoding version */ - if (ev > CEPH_PG_POOL_VERSION) { - pr_warning("got unknown v %d > %d of ceph_pg_pool\n", - ev, CEPH_PG_POOL_VERSION); - err = -EINVAL; - goto bad; - } + ceph_decode_64_safe(p, end, pool, bad); pi = __lookup_pg_pool(&map->pg_pools, pool); if (!pi) { pi = kzalloc(sizeof(*pi), GFP_NOFS); @@ -890,7 +907,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, while (len--) { struct ceph_pg_pool_info *pi; - ceph_decode_32_safe(p, end, pool, bad); + ceph_decode_64_safe(p, end, pool, bad); pi = __lookup_pg_pool(&map->pg_pools, pool); if (pi) __remove_pg_pool(&map->pg_pools, pi); @@ -946,10 +963,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, while (len--) { struct ceph_pg_mapping *pg; int j; + struct ceph_pg_v1 pgid_v1; struct ceph_pg pgid; u32 pglen; ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); - ceph_decode_copy(p, &pgid, sizeof(pgid)); + ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1)); + pgid.pool = le32_to_cpu(pgid_v1.pool); + pgid.seed = le16_to_cpu(pgid_v1.ps); pglen = ceph_decode_32(p); if (pglen) { @@ -975,8 +995,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, kfree(pg); goto bad; } - dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, - pglen); + dout(" added pg_temp %lld.%x len %d\n", pgid.pool, + pgid.seed, pglen); } else { /* remove */ __remove_pg_mapping(&map->pg_temp, pgid); @@ -1010,7 +1030,7 @@ bad: * pass a stride back to the caller. */ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 *plen, + u64 off, u64 len, u64 *ono, u64 *oxoff, u64 *oxlen) { @@ -1021,7 +1041,7 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u32 su_per_object; u64 t, su_offset; - dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, + dout("mapping %llu~%llu osize %u fl_su %u\n", off, len, osize, su); if (su == 0 || sc == 0) goto invalid; @@ -1054,11 +1074,10 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, /* * Calculate the length of the extent being written to the selected - * object. This is the minimum of the full length requested (plen) or + * object. This is the minimum of the full length requested (len) or * the remainder of the current stripe being written to. */ - *oxlen = min_t(u64, *plen, su - su_offset); - *plen = *oxlen; + *oxlen = min_t(u64, len, su - su_offset); dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); return 0; @@ -1076,33 +1095,24 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping); * calculate an object layout (i.e. pgid) from an oid, * file_layout, and osdmap */ -int ceph_calc_object_layout(struct ceph_object_layout *ol, +int ceph_calc_object_layout(struct ceph_pg *pg, const char *oid, struct ceph_file_layout *fl, struct ceph_osdmap *osdmap) { unsigned int num, num_mask; - struct ceph_pg pgid; - int poolid = le32_to_cpu(fl->fl_pg_pool); struct ceph_pg_pool_info *pool; - unsigned int ps; BUG_ON(!osdmap); - - pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); + pg->pool = le32_to_cpu(fl->fl_pg_pool); + pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool); if (!pool) return -EIO; - ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); - num = le32_to_cpu(pool->v.pg_num); + pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid)); + num = pool->pg_num; num_mask = pool->pg_num_mask; - pgid.ps = cpu_to_le16(ps); - pgid.preferred = cpu_to_le16(-1); - pgid.pool = fl->fl_pg_pool; - dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); - - ol->ol_pgid = pgid; - ol->ol_stripe_unit = fl->fl_object_stripe_unit; + dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed); return 0; } EXPORT_SYMBOL(ceph_calc_object_layout); @@ -1117,19 +1127,16 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, struct ceph_pg_mapping *pg; struct ceph_pg_pool_info *pool; int ruleno; - unsigned int poolid, ps, pps, t, r; - - poolid = le32_to_cpu(pgid.pool); - ps = le16_to_cpu(pgid.ps); + int r; + u32 pps; - pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); + pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); if (!pool) return NULL; /* pg_temp? */ - t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num), - pool->pgp_num_mask); - pgid.ps = cpu_to_le16(t); + pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, + pool->pgp_num_mask); pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { *num = pg->len; @@ -1137,26 +1144,39 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, } /* crush */ - ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, - pool->v.type, pool->v.size); + ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, + pool->type, pool->size); if (ruleno < 0) { - pr_err("no crush rule pool %d ruleset %d type %d size %d\n", - poolid, pool->v.crush_ruleset, pool->v.type, - pool->v.size); + pr_err("no crush rule pool %lld ruleset %d type %d size %d\n", + pgid.pool, pool->crush_ruleset, pool->type, + pool->size); return NULL; } - pps = ceph_stable_mod(ps, - le32_to_cpu(pool->v.pgp_num), - pool->pgp_num_mask); - pps += poolid; + if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { + /* hash pool id and seed sothat pool PGs do not overlap */ + pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(pgid.seed, pool->pgp_num, + pool->pgp_num_mask), + pgid.pool); + } else { + /* + * legacy ehavior: add ps and pool together. this is + * not a great approach because the PGs from each pool + * will overlap on top of each other: 0.5 == 1.4 == + * 2.3 == ... + */ + pps = ceph_stable_mod(pgid.seed, pool->pgp_num, + pool->pgp_num_mask) + + (unsigned)pgid.pool; + } r = crush_do_rule(osdmap->crush, ruleno, pps, osds, - min_t(int, pool->v.size, *num), + min_t(int, pool->size, *num), osdmap->osd_weight); if (r < 0) { - pr_err("error %d from crush rule: pool %d ruleset %d type %d" - " size %d\n", r, poolid, pool->v.crush_ruleset, - pool->v.type, pool->v.size); + pr_err("error %d from crush rule: pool %lld ruleset %d type %d" + " size %d\n", r, pgid.pool, pool->crush_ruleset, + pool->type, pool->size); return NULL; } *num = r; |