4 #include <linux/module.h>
5 #include <linux/slab.h>
28 snprintf(str, len,
"%s%s%s", str, (flag ?
", " :
""),
41 static int calc_bits_of(
unsigned int t)
68 static int crush_decode_uniform_bucket(
void **
p,
void *
end,
71 dout(
"crush_decode_uniform_bucket %p to %p\n", *p, end);
79 static int crush_decode_list_bucket(
void **p,
void *end,
83 dout(
"crush_decode_list_bucket %p to %p\n", *p, end);
91 for (j = 0; j < b->
h.size; j++) {
100 static int crush_decode_tree_bucket(
void **p,
void *end,
104 dout(
"crush_decode_tree_bucket %p to %p\n", *p, end);
117 static int crush_decode_straw_bucket(
void **p,
void *end,
121 dout(
"crush_decode_straw_bucket %p to %p\n", *p, end);
129 for (j = 0; j < b->
h.size; j++) {
131 b->
straws[
j] = ceph_decode_32(p);
138 static int skip_name_map(
void **p,
void *end)
153 static struct crush_map *crush_decode(
void *pbyval,
void *end)
159 void *
start = pbyval;
163 dout(
"crush_decode %p to %p len %d\n", *p, end, (
int)(end - *p));
175 magic = ceph_decode_32(p);
177 pr_err(
"crush_decode magic %x != current %x\n",
203 dout(
"crush_decode bucket %d off %x %p to %p\n",
204 i, (
int)(*p-start), *p, end);
229 b->
id = ceph_decode_32(p);
230 b->
type = ceph_decode_16(p);
231 b->
alg = ceph_decode_8(p);
232 b->
hash = ceph_decode_8(p);
233 b->
weight = ceph_decode_32(p);
234 b->
size = ceph_decode_32(p);
236 dout(
"crush_decode bucket size %d off %x %p to %p\n",
237 b->
size, (
int)(*p-start), *p, end);
248 for (j = 0; j < b->
size; j++)
249 b->
items[j] = ceph_decode_32(p);
253 err = crush_decode_uniform_bucket(p, end,
259 err = crush_decode_list_bucket(p, end,
265 err = crush_decode_tree_bucket(p, end,
271 err = crush_decode_straw_bucket(p, end,
287 dout(
"crush_decode NO rule %d off %x %p to %p\n",
288 i, (
int)(*p-start), *p, end);
293 dout(
"crush_decode rule %d off %x %p to %p\n",
294 i, (
int)(*p-start), *p, end);
298 #if BITS_PER_LONG == 32
309 dout(
" rule %d is at %p\n", i, r);
313 for (j = 0; j < r->
len; j++) {
314 r->
steps[
j].op = ceph_decode_32(p);
315 r->
steps[
j].arg1 = ceph_decode_32(p);
316 r->
steps[
j].arg2 = ceph_decode_32(p);
321 for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
322 err = skip_name_map(p, end);
332 dout(
"crush decode tunable choose_local_tries = %d",
334 dout(
"crush decode tunable choose_local_fallback_tries = %d",
336 dout(
"crush decode tunable choose_total_tries = %d",
340 dout(
"crush_decode success\n");
346 dout(
"crush_decode fail %d\n", err);
375 dout(
"__insert_pg_mapping %llx %p\n", *(
u64 *)&new->pgid,
new);
379 c = pgid_cmp(new->pgid, pg->
pgid);
388 rb_link_node(&new->node, parent, p);
402 c = pgid_cmp(pgid, pg->
pgid);
408 dout(
"__lookup_pg_mapping %llx got %p\n",
416 static int __remove_pg_mapping(
struct rb_root *root,
struct ceph_pg pgid)
421 dout(
"__remove_pg_mapping %llx %p\n", *(
u64 *)&pgid, pg);
426 dout(
"__remove_pg_mapping %llx dne\n", *(
u64 *)&pgid);
442 if (new->id < pi->
id)
444 else if (new->id > pi->
id)
450 rb_link_node(&new->node, parent, p);
464 else if (
id > pi->
id)
497 ceph_decode_copy(p, &pi->
v,
sizeof(pi->
v));
508 m = ceph_decode_32(p);
519 static int __decode_pool_names(
void **p,
void *end,
struct ceph_osdmap *
map)
525 dout(
" %d pool names\n", num);
529 dout(
" pool %d len %d\n", pool, len);
531 pi = __lookup_pg_pool(&map->
pg_pools, pool);
554 dout(
"osdmap_destroy %p\n", map);
568 __remove_pg_pool(&map->
pg_pools, pi);
585 state = kcalloc(max,
sizeof(*state),
GFP_NOFS);
586 addr = kcalloc(max,
sizeof(*addr),
GFP_NOFS);
587 weight = kcalloc(max,
sizeof(*weight),
GFP_NOFS);
625 dout(
"osdmap_decode %p to %p len %d\n", *p, end, (
int)(end - *p));
627 map = kzalloc(
sizeof(*map),
GFP_NOFS);
634 pr_warning(
"got unknown v %d > %d of osdmap\n", version,
640 ceph_decode_copy(p, &map->
fsid,
sizeof(map->
fsid));
641 map->
epoch = ceph_decode_32(p);
642 ceph_decode_copy(p, &map->created,
sizeof(map->created));
648 pi = kzalloc(
sizeof(*pi),
GFP_NOFS);
651 pi->
id = ceph_decode_32(p);
652 ev = ceph_decode_8(p);
654 pr_warning(
"got unknown v %d > %d of ceph_pg_pool\n",
659 err = __decode_pool(p, end, pi);
664 __insert_pg_pool(&map->
pg_pools, pi);
667 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
674 max = ceph_decode_32(p);
677 err = osdmap_set_max_osd(map, max);
680 dout(
"osdmap_decode max_osd = %d\n", map->
max_osd);
691 for (i = 0; i < map->
max_osd; i++)
696 for (i = 0; i < map->
max_osd; i++)
697 ceph_decode_addr(&map->
osd_addr[i]);
701 for (i = 0; i < len; i++) {
707 ceph_decode_copy(p, &pgid,
sizeof(pgid));
708 n = ceph_decode_32(p);
719 for (j = 0; j <
n; j++)
720 pg->
osds[j] = ceph_decode_32(p);
722 err = __insert_pg_mapping(pg, &map->
pg_temp);
725 dout(
" added pg_temp %llx len %d\n", *(
u64 *)&pgid, len);
730 dout(
"osdmap_decode crush len %d from off 0x%x\n", len,
733 map->
crush = crush_decode(*p, end);
735 if (IS_ERR(map->
crush)) {
736 err = PTR_ERR(map->
crush);
744 dout(
"osdmap_decode done %p %p\n", *p, end);
748 dout(
"osdmap_decode fail\n");
772 pr_warning(
"got unknown v %d > %d of inc osdmap\n", version,
779 ceph_decode_copy(p, &fsid,
sizeof(fsid));
780 epoch = ceph_decode_32(p);
782 ceph_decode_copy(p, &modified,
sizeof(modified));
783 new_pool_max = ceph_decode_32(p);
784 new_flags = ceph_decode_32(p);
789 dout(
"apply_incremental full map len %d, %p to %p\n",
797 dout(
"apply_incremental new crush map len %d, %p to %p\n",
799 newcrush = crush_decode(*p,
min(*p+len, end));
800 if (IS_ERR(newcrush))
801 return ERR_CAST(newcrush);
807 map->
flags = new_flags;
808 if (new_pool_max >= 0)
814 max = ceph_decode_32(p);
816 err = osdmap_set_max_osd(map, max);
826 map->
crush = newcrush;
838 ev = ceph_decode_8(p);
840 pr_warning(
"got unknown v %d > %d of ceph_pg_pool\n",
844 pi = __lookup_pg_pool(&map->
pg_pools, pool);
846 pi = kzalloc(
sizeof(*pi),
GFP_NOFS);
852 __insert_pg_pool(&map->
pg_pools, pi);
854 err = __decode_pool(p, end, pi);
858 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
867 pi = __lookup_pg_pool(&map->
pg_pools, pool);
869 __remove_pg_pool(&map->
pg_pools, pi);
880 ceph_decode_addr(&addr);
893 xorstate = **(
u8 **)p;
899 if (osd < map->max_osd)
908 osd = ceph_decode_32(p);
909 off = ceph_decode_32(p);
910 pr_info(
"osd%d weight 0x%x %s\n", osd, off,
913 if (osd < map->max_osd)
925 ceph_decode_copy(p, &pgid,
sizeof(pgid));
926 pglen = ceph_decode_32(p);
935 if (pglen > (
UINT_MAX -
sizeof(*pg)) /
sizeof(
u32)) {
946 for (j = 0; j < pglen; j++)
947 pg->
osds[j] = ceph_decode_32(p);
948 err = __insert_pg_mapping(pg, &map->
pg_temp);
953 dout(
" added pg_temp %llx len %d\n", *(
u64 *)&pgid,
957 __remove_pg_mapping(&map->
pg_temp, pgid);
966 pr_err(
"corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
967 epoch, (
int)(*p - start), *p, start, end);
970 start, end - start,
true);
995 u32 bl, stripeno, stripepos, objsetno;
999 dout(
"mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
1001 if (su == 0 || sc == 0)
1003 su_per_object = osize /
su;
1004 if (su_per_object == 0)
1006 dout(
"osize %u / su %u = su_per_object %u\n", osize, su,
1016 dout(
"off %llu / su %u = bl %u\n", off, su, bl);
1019 stripepos = bl %
sc;
1020 objsetno = stripeno / su_per_object;
1022 *ono = objsetno * sc + stripepos;
1023 dout(
"objset %u * sc %u = ono %u\n", objsetno, sc, (
unsigned int)*ono);
1027 su_offset =
do_div(t, su);
1028 *oxoff = su_offset + (stripeno % su_per_object) * su;
1035 *oxlen =
min_t(
u64, *plen, su - su_offset);
1038 dout(
" obj extent %llu~%llu\n", *oxoff, *oxlen);
1042 dout(
" invalid layout\n");
1059 unsigned int num, num_mask;
1067 pool = __lookup_pg_pool(&osdmap->
pg_pools, poolid);
1077 dout(
"calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
1090 int *osds,
int *num)
1095 unsigned int poolid,
ps, pps,
t,
r;
1100 pool = __lookup_pg_pool(&osdmap->
pg_pools, poolid);
1108 pg = __lookup_pg_mapping(&osdmap->
pg_temp, pgid);
1116 pool->
v.type, pool->
v.size);
1118 pr_err(
"no crush rule pool %d ruleset %d type %d size %d\n",
1119 poolid, pool->
v.crush_ruleset, pool->
v.type,
1124 pps = ceph_stable_mod(ps,
1129 min_t(
int, pool->
v.size, *num),
1132 pr_err(
"error %d from crush rule: pool %d ruleset %d type %d"
1133 " size %d\n", r, poolid, pool->
v.crush_ruleset,
1134 pool->
v.type, pool->
v.size);
1150 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1156 for (i = 0; i < num; i++)
1157 if (ceph_osd_is_up(osdmap, osds[i]))
1158 acting[o++] = osds[
i];
1170 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1175 for (i = 0; i < num; i++)
1176 if (ceph_osd_is_up(osdmap, osds[i]))