#include <linux/slab.h>
#include <linux/module.h>

#include "md.h"
#include "raid1.h"
#include "raid5.h"
#include "raid10.h"
#include "bitmap.h"

#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "raid"
/*
 * Flags for rs->print_flags field.
 */
#define DMPF_SYNC              0x1
#define DMPF_NOSYNC            0x2
#define DMPF_REBUILD           0x4
#define DMPF_DAEMON_SLEEP      0x8
#define DMPF_MIN_RECOVERY_RATE 0x10
#define DMPF_MAX_RECOVERY_RATE 0x20
#define DMPF_MAX_WRITE_BEHIND  0x40
#define DMPF_STRIPE_CACHE      0x80
#define DMPF_REGION_SIZE       0x100
#define DMPF_RAID10_COPIES     0x200
#define DMPF_RAID10_FORMAT     0x400
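/*
 * Illustrative: a table created with "... rebuild 1 max_write_behind 256 ..."
 * leaves DMPF_REBUILD | DMPF_MAX_WRITE_BEHIND set in rs->print_flags, which
 * raid_status() later uses to echo those parameters back to userspace.
 */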
/* Supported raid types and properties. */
static struct raid_type {
	const char *name;		/* RAID algorithm. */
	const char *descr;		/* Descriptor text for logging. */
	const unsigned parity_devs;	/* # of parity devices. */
	const unsigned minimal_devs;	/* minimal # of devices in set. */
	const unsigned level;		/* RAID level. */
	const unsigned algorithm;	/* RAID algorithm. */
} raid_types[] = {
	{"raid1",  "RAID1 (mirroring)",        0, 2, 1,  0},
	{"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */},
	/* ... raid4/raid5/raid6 variants elided ... */
};
static unsigned raid10_md_layout_to_copies(int layout)
{
	return layout & 0xFF;
}

static int raid10_format_to_md_layout(char *format, unsigned copies)
{
	/* 1 "far" copy, and 'copies' "near" copies */
	return (1 << 8) | (copies & 0xFF);
}
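/*
 * Worked example (derived from the two helpers above): the default "near"
 * format with 2 copies encodes as (1 << 8) | 2 == 0x102, and
 * raid10_md_layout_to_copies(0x102) == 2 recovers the copy count.
 */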
static struct raid_type *get_raid_type(char *name)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(raid_types); i++)
		if (!strcmp(raid_types[i].name, name))
			return &raid_types[i];

	return NULL;
}
static struct raid_set *context_alloc(struct dm_target *ti,
				      struct raid_type *raid_type,
				      unsigned raid_devs)
{
	unsigned i;
	struct raid_set *rs;

	if (raid_devs <= raid_type->parity_devs) {
		ti->error = "Insufficient number of devices";
		return ERR_PTR(-EINVAL);
	}

	rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
	if (!rs) {
		ti->error = "Cannot allocate raid context";
		return ERR_PTR(-ENOMEM);
	}

	mddev_init(&rs->md);

	rs->ti = ti;
	rs->raid_type = raid_type;
	rs->md.raid_disks = raid_devs;
	rs->md.level = raid_type->level;
	rs->md.new_level = rs->md.level;
	rs->md.layout = raid_type->algorithm;
	rs->md.new_layout = rs->md.layout;
	rs->md.delta_disks = 0;
	rs->md.recovery_cp = 0;

	for (i = 0; i < raid_devs; i++)
		md_rdev_init(&rs->dev[i].rdev);

	return rs;
}
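/*
 * Note: the remaining mddev fields - rs->md.persistent, rs->md.external,
 * rs->md.chunk_sectors, rs->md.new_chunk_sectors and rs->md.dev_sectors -
 * are filled in later by parse_raid_params() and dev_parms().
 */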
static void context_free(struct raid_set *rs)
{
	int i;

	for (i = 0; i < rs->md.raid_disks; i++) {
		if (rs->dev[i].meta_dev)
			dm_put_device(rs->ti, rs->dev[i].meta_dev);
		md_rdev_clear(&rs->dev[i].rdev);
		if (rs->dev[i].data_dev)
			dm_put_device(rs->ti, rs->dev[i].data_dev);
	}

	kfree(rs);
}
/*
 * For every device we have two words:
 *   <meta_dev>: meta device name or '-' if missing
 *   <data_dev>: data device name or '-' if missing
 *
 * Permitted: "- -", "- <data_dev>", "<meta_dev> <data_dev>".
 * Not permitted: "<meta_dev> -" (metadata without a data device).
 */
static int dev_parms(struct raid_set *rs, char **argv)
{
	int i;
	int rebuild = 0;
	int metadata_available = 0;
	int ret = 0;

	for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
		rs->dev[i].rdev.raid_disk = i;

		rs->dev[i].meta_dev = NULL;
		rs->dev[i].data_dev = NULL;

		/*
		 * There are no offsets, since there is a separate device
		 * for data and metadata.
		 */
		rs->dev[i].rdev.data_offset = 0;
		rs->dev[i].rdev.mddev = &rs->md;

		if (strcmp(argv[0], "-")) {
			ret = dm_get_device(rs->ti, argv[0],
					    dm_table_get_mode(rs->ti->table),
					    &rs->dev[i].meta_dev);
			rs->ti->error = "RAID metadata device lookup failure";
			if (ret)
				return ret;

			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
			if (!rs->dev[i].rdev.sb_page)
				return -ENOMEM;
		}

		if (!strcmp(argv[1], "-")) {
			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
			    (!rs->dev[i].rdev.recovery_offset)) {
				rs->ti->error = "Drive designated for rebuild not specified";
				return -EINVAL;
			}

			rs->ti->error = "No data device supplied with metadata device";
			if (rs->dev[i].meta_dev)
				return -EINVAL;

			continue;
		}

		ret = dm_get_device(rs->ti, argv[1],
				    dm_table_get_mode(rs->ti->table),
				    &rs->dev[i].data_dev);
		if (ret) {
			rs->ti->error = "RAID device lookup failure";
			return ret;
		}

		if (rs->dev[i].meta_dev) {
			metadata_available = 1;
			rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
		}
		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
		list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
			rebuild++;
	}

	if (metadata_available) {
		rs->md.external = 0;
		rs->md.persistent = 1;
		rs->md.major_version = 2;
	} else if (rebuild && !rs->md.recovery_cp) {
		/*
		 * Without metadata, we cannot tell whether the array is
		 * in-sync or not - we must assume it is not.  Therefore,
		 * it is impossible to rebuild a drive.
		 *
		 * User could specify 'nosync' option if desperate.
		 */
		DMERR("Unable to rebuild drive while array is not in-sync");
		rs->ti->error = "Unable to rebuild drive while array is not in-sync";
		return -EINVAL;
	}

	return 0;
}
/*
 * validate_region_size
 * @rs
 * @region_size:  region size in sectors.  If 0, pick a size (4MiB default).
 *
 * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
 * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
 *
 * Returns: 0 on success, -EINVAL on failure.
 */
static int validate_region_size(struct raid_set *rs, unsigned long region_size)
{
	unsigned long min_region_size = rs->ti->len / (1 << 21);

	if (!region_size) {
		/*
		 * Choose a reasonable default.  All figures in sectors.
		 */
		if (min_region_size > (1 << 13)) {
			region_size = min_region_size;
			DMINFO("Choosing default region size of %lu sectors",
			       region_size);
		} else {
			DMINFO("Choosing default region size of 4MiB");
			region_size = 1 << 13; /* sectors */
		}
	} else {
		/*
		 * Validate user-supplied value.
		 */
		if (region_size > rs->ti->len) {
			rs->ti->error = "Supplied region size is too large";
			return -EINVAL;
		}

		if (region_size < min_region_size) {
			DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
			      region_size, min_region_size);
			rs->ti->error = "Supplied region size is too small";
			return -EINVAL;
		}

		if (!is_power_of_2(region_size)) {
			rs->ti->error = "Region size is not a power of 2";
			return -EINVAL;
		}

		if (region_size < rs->md.chunk_sectors) {
			rs->ti->error = "Region size is smaller than the chunk size";
			return -EINVAL;
		}
	}

	/*
	 * Convert sectors to bytes.
	 */
	rs->md.bitmap_info.chunksize = (region_size << 9);

	return 0;
}
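/*
 * Worked example: for the 4MiB default, region_size is 1 << 13 == 8192
 * sectors; 8192 sectors * 512 bytes == 4MiB, so the stored
 * bitmap_info.chunksize becomes 8192 << 9 == 4194304 bytes.
 */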
/*
 * validate_rebuild_devices
 * @rs
 *
 * Determine if the devices specified for rebuild can result in a valid
 * usable array that is capable of rebuilding the given devices.
 *
 * Returns: 0 on success, -EINVAL on failure.
 */
static int validate_rebuild_devices(struct raid_set *rs)
{
	unsigned i, rebuild_cnt = 0;
	unsigned rebuilds_per_group, copies, d;

	if (!(rs->print_flags & DMPF_REBUILD))
		return 0;

	for (i = 0; i < rs->md.raid_disks; i++)
		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
			rebuild_cnt++;

	switch (rs->raid_type->level) {
	case 1:
		if (rebuild_cnt >= rs->md.raid_disks)
			goto too_many;
		break;
	case 4:
	case 5:
	case 6:
		if (rebuild_cnt > rs->raid_type->parity_devs)
			goto too_many;
		break;
	case 10:
		copies = raid10_md_layout_to_copies(rs->md.layout);
		if (rebuild_cnt < copies)
			break;

		/*
		 * A higher rebuild count is possible for RAID10, as long as
		 * the failed devices occur in different mirror groups (i.e.
		 * different stripes).  Only the "near" format is supported,
		 * so the groups are consecutive runs of 'copies' devices.
		 */
		rebuilds_per_group = 0;
		for (i = 0; i < rs->md.raid_disks * copies; i++) {
			d = i % rs->md.raid_disks;
			if (!test_bit(In_sync, &rs->dev[d].rdev.flags) &&
			    (++rebuilds_per_group >= copies))
				goto too_many;
			if (!((i + 1) % copies))
				rebuilds_per_group = 0;
		}
		break;
	default:
		DMERR("The rebuild parameter is not supported for %s",
		      rs->raid_type->name);
		rs->ti->error = "Rebuild not supported for this RAID type";
		return -EINVAL;
	}

	return 0;

too_many:
	rs->ti->error = "Too many rebuild devices specified";
	return -EINVAL;
}
/*
 * Possible arguments are...
 *	<chunk_size> [optional_args]
 *
 * Argument definitions
 *    <chunk_size>			The number of sectors per disk that
 *					will form the "stripe"
 *    [[no]sync]			Force or prevent recovery of the
 *					entire array
 *    [rebuild <idx>]			Rebuild the drive indicated by the index
 *    [daemon_sleep <ms>]		Time between bitmap daemon work to
 *					clear bits
 *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 *    [write_mostly <idx>]		Indicate a write mostly drive via index
 *    [max_write_behind <sectors>]	See '--write-behind=' (man mdadm)
 *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
 *    [region_size <sectors>]		Defines granularity of bitmap
 *
 * RAID10-only options:
 *    [raid10_copies <# copies>]	Number of copies.  (Default: 2)
 *    [raid10_format <near>]		Layout algorithm.  (Default: near)
 */
static int parse_raid_params(struct raid_set *rs, char **argv,
			     unsigned num_raid_params)
{
	char *raid10_format = "near";
	unsigned raid10_copies = 2;
	unsigned i;
	unsigned long value, region_size = 0;
	sector_t sectors_per_dev = rs->ti->len;
	sector_t max_io_len;
	char *key;

	/*
	 * First, parse the in-order required arguments.
	 * "chunk_size" is the only argument of this type.
	 */
	if ((strict_strtoul(argv[0], 10, &value) < 0)) {
		rs->ti->error = "Bad chunk size";
		return -EINVAL;
	} else if (rs->raid_type->level == 1) {
		if (value)
			DMERR("Ignoring chunk size parameter for RAID 1");
		value = 0;
	} else if (!is_power_of_2(value)) {
		rs->ti->error = "Chunk size must be a power of 2";
		return -EINVAL;
	} else if (value < 8) {
		rs->ti->error = "Chunk size value is too small";
		return -EINVAL;
	}

	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
	argv++;
	num_raid_params--;

	/*
	 * Mark every device In_sync with a completed recovery_offset for
	 * now; the superblocks, 'rebuild' and '[no]sync' override this.
	 */
	for (i = 0; i < rs->md.raid_disks; i++) {
		set_bit(In_sync, &rs->dev[i].rdev.flags);
		rs->dev[i].rdev.recovery_offset = MaxSector;
	}

	/*
	 * Second, parse the unordered optional arguments.
	 */
	for (i = 0; i < num_raid_params; i++) {
		if (!strcasecmp(argv[i], "nosync")) {
			rs->md.recovery_cp = MaxSector;
			rs->print_flags |= DMPF_NOSYNC;
			continue;
		}
		if (!strcasecmp(argv[i], "sync")) {
			rs->md.recovery_cp = 0;
			rs->print_flags |= DMPF_SYNC;
			continue;
		}

		/* The rest of the optional arguments come in key/value pairs */
		if ((i + 1) >= num_raid_params) {
			rs->ti->error = "Wrong number of raid parameters given";
			return -EINVAL;
		}

		key = argv[i++];

		/* Parameters that take a string value are checked here. */
		if (!strcasecmp(key, "raid10_format")) {
			if (rs->raid_type->level != 10) {
				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
				return -EINVAL;
			}
			if (strcmp("near", argv[i])) {
				rs->ti->error = "Invalid 'raid10_format' value given";
				return -EINVAL;
			}
			raid10_format = argv[i];
			rs->print_flags |= DMPF_RAID10_FORMAT;
			continue;
		}

		if (strict_strtoul(argv[i], 10, &value) < 0) {
			rs->ti->error = "Bad numerical argument given in raid params";
			return -EINVAL;
		}

		/* Parameters that take a numeric value are checked here. */
		if (!strcasecmp(key, "rebuild")) {
			if (value >= rs->md.raid_disks) {
				rs->ti->error = "Invalid rebuild index given";
				return -EINVAL;
			}
			clear_bit(In_sync, &rs->dev[value].rdev.flags);
			rs->dev[value].rdev.recovery_offset = 0;
			rs->print_flags |= DMPF_REBUILD;
		} else if (!strcasecmp(key, "write_mostly")) {
			if (rs->raid_type->level != 1) {
				rs->ti->error = "write_mostly option is only valid for RAID1";
				return -EINVAL;
			}
			if (value >= rs->md.raid_disks) {
				rs->ti->error = "Invalid write_mostly drive index given";
				return -EINVAL;
			}
			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
		} else if (!strcasecmp(key, "max_write_behind")) {
			if (rs->raid_type->level != 1) {
				rs->ti->error = "max_write_behind option is only valid for RAID1";
				return -EINVAL;
			}
			rs->print_flags |= DMPF_MAX_WRITE_BEHIND;

			/*
			 * In device-mapper, we specify things in sectors, but
			 * MD records this value in kB.
			 */
			value /= 2;
			if (value > COUNTER_MAX) {
				rs->ti->error = "Max write-behind limit out of range";
				return -EINVAL;
			}
			rs->md.bitmap_info.max_write_behind = value;
		} else if (!strcasecmp(key, "daemon_sleep")) {
			rs->print_flags |= DMPF_DAEMON_SLEEP;
			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
				rs->ti->error = "daemon sleep period out of range";
				return -EINVAL;
			}
			rs->md.bitmap_info.daemon_sleep = value;
		} else if (!strcasecmp(key, "stripe_cache")) {
			rs->print_flags |= DMPF_STRIPE_CACHE;

			/*
			 * In device-mapper, we specify things in sectors, but
			 * MD records this value in pages.
			 */
			value /= 2;

			if ((rs->raid_type->level != 5) &&
			    (rs->raid_type->level != 6)) {
				rs->ti->error = "Inappropriate argument: stripe_cache";
				return -EINVAL;
			}
			if (raid5_set_cache_size(&rs->md, (int)value)) {
				rs->ti->error = "Bad stripe_cache size";
				return -EINVAL;
			}
		} else if (!strcasecmp(key, "min_recovery_rate")) {
			rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
			if (value > INT_MAX) {
				rs->ti->error = "min_recovery_rate out of range";
				return -EINVAL;
			}
			rs->md.sync_speed_min = (int)value;
		} else if (!strcasecmp(key, "max_recovery_rate")) {
			rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
			if (value > INT_MAX) {
				rs->ti->error = "max_recovery_rate out of range";
				return -EINVAL;
			}
			rs->md.sync_speed_max = (int)value;
		} else if (!strcasecmp(key, "region_size")) {
			rs->print_flags |= DMPF_REGION_SIZE;
			region_size = value;
		} else if (!strcasecmp(key, "raid10_copies") &&
			   (rs->raid_type->level == 10)) {
			if ((value < 2) || (value > 0xFF)) {
				rs->ti->error = "Bad value for 'raid10_copies'";
				return -EINVAL;
			}
			rs->print_flags |= DMPF_RAID10_COPIES;
			raid10_copies = value;
		} else {
			DMERR("Unable to parse RAID parameter: %s", key);
			rs->ti->error = "Unable to parse RAID parameters";
			return -EINVAL;
		}
	}

	if (validate_region_size(rs, region_size))
		return -EINVAL;

	if (rs->md.chunk_sectors)
		max_io_len = rs->md.chunk_sectors;
	else
		max_io_len = region_size;

	if (dm_set_target_max_io_len(rs->ti, max_io_len))
		return -EINVAL;

	if (rs->raid_type->level == 10) {
		if (raid10_copies > rs->md.raid_disks) {
			rs->ti->error = "Not enough devices to satisfy specification";
			return -EINVAL;
		}

		/* (Len * #mirrors) / #devices */
		sectors_per_dev = rs->ti->len * raid10_copies;
		sector_div(sectors_per_dev, rs->md.raid_disks);

		rs->md.layout = raid10_format_to_md_layout(raid10_format,
							   raid10_copies);
		rs->md.new_layout = rs->md.layout;
	} else if ((rs->raid_type->level > 1) &&
		   sector_div(sectors_per_dev,
			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
		rs->ti->error = "Target length not divisible by number of data devices";
		return -EINVAL;
	}
	rs->md.dev_sectors = sectors_per_dev;

	if (validate_rebuild_devices(rs))
		return -EINVAL;

	/* Assume there are no metadata devices until the drives are parsed */
	rs->md.persistent = 0;
	rs->md.external = 1;

	return 0;
}
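/*
 * Illustrative parameter list as parsed above (values hypothetical):
 *   argv = { "128", "nosync", "min_recovery_rate", "100" }
 * i.e. a 128-sector (64KiB) chunk, skip the initial resync, and throttle
 * recovery to no less than 100 kB/sec/disk.
 */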
/*
 * This superblock is never routinely used by userspace, unlike md
 * superblocks.  Devices with it should only be accessed via device-mapper.
 */
#define DM_RAID_MAGIC 0x64526D44	/* "DmRd" in little-endian */

/* ... struct dm_raid_superblock and read_disk_sb() elided; on a failed
 * superblock read, the latter reports:
 */
		DMERR("Failed to read superblock of device at position %d",
		      rdev->raid_disk);
/* ... */

/* super_sync(): record missing/faulty devices, then rewrite the superblock */
	for (i = 0; i < mddev->raid_disks; i++)
		if (!rs->dev[i].data_dev ||
		    test_bit(Faulty, &(rs->dev[i].rdev.flags)))
			failed_devices |= (1ULL << i);

	memset(sb, 0, sizeof(*sb));
	/* ... */
/*
 * super_load
 *
 * This function creates a superblock if one is not found on the device
 * and will decide which superblock to use if there's a choice.
 *
 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
 */
static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
{
	int ret;
	/* ... */

	ret = read_disk_sb(rdev, rdev->sb_size);
	if (ret)
		return ret;

	/* ... a new device, or one marked for rebuild, gets a fresh superblock: */
		super_sync(rdev->mddev, rdev);
		/* ... */
		/* Any superblock is better than none, choose that if given */
		return refdev ? 0 : 1;

	/* ... otherwise the device with the higher event count is fresher: */
	return (events_sb > events_refsb) ? 1 : 0;
}
static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
{
	/* ... */
	mddev->events = events_sb ? : 1;

	/*
	 * Reshaping is not currently allowed.
	 */
	if ((le32_to_cpu(sb->level) != mddev->level) ||
	    (le32_to_cpu(sb->layout) != mddev->layout) ||
	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
		DMERR("Reshaping arrays not yet supported.");
		return -EINVAL;
	}

	/* We can only change the number of devices in RAID1 right now */
	if ((rs->raid_type->level != 1) &&
	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
		DMERR("Reshaping arrays not yet supported.");
		return -EINVAL;
	}

	/* ... count rebuild targets and first-time devices: */
	rdev_for_each(r, mddev) {
		if (!test_bit(In_sync, &r->flags)) {
			DMINFO("Device %d specified for rebuild: Clearing superblock",
			       r->raid_disk);
			rebuilds++;
		} else if (test_bit(FirstUse, &r->flags))
			new_devs++;
	}

	if (!rebuilds) {
		if (new_devs == mddev->raid_disks) {
			DMINFO("Superblocks created for new array");
			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
		} else if (new_devs) {
			DMERR("New device injected into existing array without 'rebuild' parameter specified");
			return -EINVAL;
		}
	} else if (new_devs) {
		DMERR("'rebuild' devices cannot be injected into an array with other first-time devices");
		return -EINVAL;
	} else if (mddev->recovery_cp != MaxSector) {
		DMERR("'rebuild' specified while array is not in-sync");
		return -EINVAL;
	}

	/* ... check each superblock's recorded position for re-ordering: */
			if (role != r->raid_disk) {
				if (rs->raid_type->level != 1) {
					rs->ti->error = "Cannot change device positions in RAID array";
					return -EINVAL;
				}
				DMINFO("RAID1 device #%d now at position #%d",
				       role, r->raid_disk);
			}

			/* Partial recovery is performed on returning failed devices */
			if (failed_devices & (1 << role))
				set_bit(Faulty, &r->flags);
	/* ... */

	return 0;
}
static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	struct dm_raid_superblock *sb = page_address(rdev->sb_page);

	/*
	 * If mddev->events is not set, we know we have not yet initialized
	 * the array.
	 */
	if (!mddev->events && super_init_validation(mddev, rdev))
		return -EINVAL;

	/* Enable bitmap creation */
	mddev->bitmap_info.offset = 4096 >> 9;
	rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
	/* ... per-device In_sync / recovery_offset handling elided ... */
	return 0;
}
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{
	int ret;
	unsigned redundancy = 0;
	struct md_rdev *rdev, *tmp, *freshest;
	struct mddev *mddev = &rs->md;

	switch (rs->raid_type->level) {
	case 1:
		redundancy = rs->md.raid_disks - 1;
		break;
	case 4:
	case 5:
	case 6:
		redundancy = rs->raid_type->parity_devs;
		break;
	case 10:
		redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
		break;
	default:
		ti->error = "Unknown RAID type";
		return -EINVAL;
	}

	freshest = NULL;
	rdev_for_each_safe(rdev, tmp, mddev) {
		if (!rdev->meta_bdev)
			continue;

		ret = super_load(rdev, freshest);

		switch (ret) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
			/* ... tolerate the failure if 'redundancy' allows ... */
			ti->error = "Failed to load superblock";
			/* ... */
		}
	}

	if (!freshest)
		return 0;

	/*
	 * Validation of the freshest device provides the source of
	 * validation for the remaining devices.
	 */
	ti->error = "Unable to assemble array: Invalid superblocks";
	if (super_validate(mddev, freshest))
		return -EINVAL;

	rdev_for_each(rdev, mddev)
		if ((rdev != freshest) && super_validate(mddev, rdev))
			return -EINVAL;

	return 0;
}
/*
 * Construct a RAID mapping:
 * Args:
 *	<raid_type> <#raid_params> <raid_params>	\
 *	<#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
 */
static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int ret;
	struct raid_type *rt;
	unsigned long num_raid_params, num_raid_devs;
	struct raid_set *rs = NULL;

	/* Must have at least <raid_type> <#raid_params> */
	if (argc < 2) {
		ti->error = "Too few arguments";
		return -EINVAL;
	}

	/* raid type */
	rt = get_raid_type(argv[0]);
	if (!rt) {
		ti->error = "Unrecognised raid_type";
		return -EINVAL;
	}
	argc--;
	argv++;

	/* number of RAID parameters */
	if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
		ti->error = "Cannot understand number of RAID parameters";
		return -EINVAL;
	}
	argc--;
	argv++;

	/* Skip over RAID params for now and find out # of devices */
	if (num_raid_params + 1 > argc) {
		ti->error = "Arguments do not agree with counts given";
		return -EINVAL;
	}

	if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
	    (num_raid_devs >= INT_MAX)) {
		ti->error = "Cannot understand number of raid devices";
		return -EINVAL;
	}

	rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
	if (IS_ERR(rs))
		return PTR_ERR(rs);

	ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
	if (ret)
		goto bad;

	ret = -EINVAL;

	argc -= num_raid_params + 1; /* +1: we already have #raid_devs */
	argv += num_raid_params + 1;

	if (argc != (num_raid_devs * 2)) {
		ti->error = "Supplied RAID devices do not match the count given";
		goto bad;
	}

	ret = dev_parms(rs, argv);
	if (ret)
		goto bad;

	rs->md.sync_super = super_sync;
	ret = analyse_superblocks(ti, rs);
	if (ret)
		goto bad;

	ti->private = rs;
	ti->num_flush_requests = 1;

	mutex_lock(&rs->md.reconfig_mutex);
	ret = md_run(&rs->md);
	rs->md.in_sync = 0; /* Assume already marked dirty */
	mutex_unlock(&rs->md.reconfig_mutex);

	if (ret) {
		ti->error = "Failed to run raid array";
		goto bad;
	}

	if (ti->len != rs->md.array_sectors) {
		ti->error = "Array size does not match requested target length";
		ret = -EINVAL;
		goto size_mismatch;
	}

	rs->callbacks.congested_fn = raid_is_congested;
	dm_table_add_target_callbacks(ti->table, &rs->callbacks);

	mddev_suspend(&rs->md);
	return 0;

size_mismatch:
	md_stop(&rs->md);
bad:
	context_free(rs);

	return ret;
}
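/*
 * Illustrative dmsetup usage (device names and sizes hypothetical):
 *   dmsetup create my_raid1 --table \
 *     "0 2097152 raid raid1 3 0 region_size 1024 2 - /dev/sda1 - /dev/sdb1"
 * maps a 1GiB raid1 target with no metadata devices, a 512KiB region size,
 * and the (ignored for RAID1) chunk size argument set to 0.
 */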
static void raid_dtr(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	list_del_init(&rs->callbacks.list);
	md_stop(&rs->md);
	context_free(rs);
}
static int raid_map(struct dm_target *ti, struct bio *bio,
		    union map_info *map_context)
{
	struct raid_set *rs = ti->private;
	struct mddev *mddev = &rs->md;

	/* Hand the bio straight to the MD personality */
	mddev->pers->make_request(mddev, bio);

	return DM_MAPIO_SUBMITTED;
}
static int raid_status(struct dm_target *ti, status_type_t type,
		       unsigned status_flags, char *result, unsigned maxlen)
{
	struct raid_set *rs = ti->private;
	unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
	unsigned sz = 0;
	int i, array_in_sync = 0;
	sector_t sync;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);

		if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
			sync = rs->md.curr_resync_completed;
		else
			sync = rs->md.recovery_cp;

		if (sync >= rs->md.resync_max_sectors) {
			array_in_sync = 1;
			sync = rs->md.resync_max_sectors;
		} else {
			/*
			 * The array may be doing an initial sync, or it may
			 * be rebuilding individual components.  If all the
			 * devices are In_sync, then it is the array that is
			 * being initialized.
			 */
			for (i = 0; i < rs->md.raid_disks; i++)
				if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
					array_in_sync = 1;
		}

		/*
		 * Status characters:
		 *  'D' = Dead/Failed device
		 *  'a' = Alive but not in-sync
		 *  'A' = Alive and in-sync
		 */
		for (i = 0; i < rs->md.raid_disks; i++) {
			if (test_bit(Faulty, &rs->dev[i].rdev.flags))
				DMEMIT("D");
			else if (!array_in_sync ||
				 !test_bit(In_sync, &rs->dev[i].rdev.flags))
				DMEMIT("a");
			else
				DMEMIT("A");
		}

		/*
		 * In-sync ratio: shows the progress of either initializing
		 * the array or rebuilding a subset of its devices.
		 */
		DMEMIT(" %llu/%llu",
		       (unsigned long long) sync,
		       (unsigned long long) rs->md.resync_max_sectors);

		break;
	case STATUSTYPE_TABLE:
		/* The string you would use to construct this array */
		for (i = 0; i < rs->md.raid_disks; i++) {
			if ((rs->print_flags & DMPF_REBUILD) &&
			    rs->dev[i].data_dev &&
			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
				raid_param_cnt += 2; /* for rebuilds */
			if (rs->dev[i].data_dev &&
			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
				raid_param_cnt += 2;
		}

		raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2);
		if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
			raid_param_cnt--;

		DMEMIT("%s %u %u", rs->raid_type->name,
		       raid_param_cnt, rs->md.chunk_sectors);

		if ((rs->print_flags & DMPF_SYNC) &&
		    (rs->md.recovery_cp == MaxSector))
			DMEMIT(" sync");
		if (rs->print_flags & DMPF_NOSYNC)
			DMEMIT(" nosync");

		for (i = 0; i < rs->md.raid_disks; i++)
			if ((rs->print_flags & DMPF_REBUILD) &&
			    rs->dev[i].data_dev &&
			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
				DMEMIT(" rebuild %u", i);

		if (rs->print_flags & DMPF_DAEMON_SLEEP)
			DMEMIT(" daemon_sleep %lu",
			       rs->md.bitmap_info.daemon_sleep);

		if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
			DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);

		if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);

		for (i = 0; i < rs->md.raid_disks; i++)
			if (rs->dev[i].data_dev &&
			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
				DMEMIT(" write_mostly %u", i);

		if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
			DMEMIT(" max_write_behind %lu",
			       rs->md.bitmap_info.max_write_behind);

		if (rs->print_flags & DMPF_STRIPE_CACHE) {
			struct r5conf *conf = rs->md.private;

			/* convert from kiB to sectors */
			DMEMIT(" stripe_cache %d",
			       conf ? conf->max_nr_stripes * 2 : 0);
		}

		if (rs->print_flags & DMPF_REGION_SIZE)
			DMEMIT(" region_size %lu",
			       rs->md.bitmap_info.chunksize >> 9);

		if (rs->print_flags & DMPF_RAID10_COPIES)
			DMEMIT(" raid10_copies %u",
			       raid10_md_layout_to_copies(rs->md.layout));

		if (rs->print_flags & DMPF_RAID10_FORMAT)
			DMEMIT(" raid10_format near");

		DMEMIT(" %d", rs->md.raid_disks);
		for (i = 0; i < rs->md.raid_disks; i++) {
			if (rs->dev[i].meta_dev)
				DMEMIT(" %s", rs->dev[i].meta_dev->name);
			else
				DMEMIT(" -");

			if (rs->dev[i].data_dev)
				DMEMIT(" %s", rs->dev[i].data_dev->name);
			else
				DMEMIT(" -");
		}
	}

	return 0;
}
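/*
 * Illustrative STATUSTYPE_INFO output for a healthy two-leg RAID1 (values
 * hypothetical): "raid1 2 AA 2097152/2097152" - both legs alive and
 * in-sync, with the resync complete.
 */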
static int raid_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	struct raid_set *rs = ti->private;
	unsigned i;
	int ret = 0;

	for (i = 0; !ret && i < rs->md.raid_disks; i++)
		if (rs->dev[i].data_dev)
			ret = fn(ti,
				 rs->dev[i].data_dev,
				 0, /* No offset on data devs */
				 rs->md.dev_sectors,
				 data);

	return ret;
}
static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct raid_set *rs = ti->private;
	unsigned chunk_size = rs->md.chunk_sectors << 9;
	struct r5conf *conf = rs->md.private;

	blk_limits_io_min(limits, chunk_size);
	blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
}
static void raid_presuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	md_stop_writes(&rs->md);
}

static void raid_postsuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	mddev_suspend(&rs->md);
}

static void raid_resume(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	/* ... load the bitmap on first resume ... */
	clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
	mddev_resume(&rs->md);
}
static struct target_type raid_target = {
	.name = "raid",
	.version = {1, 3, 1},
	.module = THIS_MODULE,
	.ctr = raid_ctr,
	.dtr = raid_dtr,
	.map = raid_map,
	.status = raid_status,
	.iterate_devices = raid_iterate_devices,
	.io_hints = raid_io_hints,
	.presuspend = raid_presuspend,
	.postsuspend = raid_postsuspend,
	.resume = raid_resume,
};
static int __init dm_raid_init(void)
{
	return dm_register_target(&raid_target);
}

static void __exit dm_raid_exit(void)
{
	dm_unregister_target(&raid_target);
}

module_init(dm_raid_init);
module_exit(dm_raid_exit);

MODULE_LICENSE("GPL");