#include <linux/module.h>
#include <linux/slab.h>
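/*
 * Stripe cache geometry: the cache holds NR_STRIPES stripe_heads by default,
 * each covering STRIPE_SIZE (one page) per component device, i.e.
 * STRIPE_SECTORS 512-byte sectors.  Stripes are looked up through a hash
 * table of NR_HASH buckets; NR_HASH is a power of two, so HASH_MASK can be
 * used to reduce a hash value to a bucket index.  IO_THRESHOLD and
 * BYPASS_THRESHOLD tune when delayed and preread-bypass stripes get handled.
 */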
#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)
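/*
 * r5_next_bio() walks the sorted list of bios attached to one stripe device:
 * it yields bio->bi_next while the current bio still ends inside the
 * STRIPE_SECTORS window starting at @sector, and NULL once a bio reaches the
 * end of that window.
 */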
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
	int sectors = bio->bi_size >> 9;
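/*
 * For bios queued to raid5, ->bi_phys_segments is reused as a pair of 16-bit
 * counters: the low half counts the stripes that still reference the bio
 * (the bio can be completed when this drops to zero) and the high half
 * records how many stripes of the bio have already been processed, so a
 * partially handled request can be resumed (see retry_aligned_read()).
 */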
static inline int raid5_bi_processed_stripes(struct bio *bio)

static inline int raid5_dec_bi_active_stripes(struct bio *bio)

static inline void raid5_inc_bi_active_stripes(struct bio *bio)

static inline void raid5_set_bi_processed_stripes(struct bio *bio,
		new = (old & 0xffff) | (cnt << 16);

static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)

static inline int raid6_next_disk(int disk, int raid_disks)
	return (disk < raid_disks) ? disk : 0;

			     int *count, int syndrome_disks)
		return syndrome_disks;
		return syndrome_disks + 1;
static void return_io(struct bio *return_bi)
	struct bio *bi = return_bi;
		return_bi = bi->bi_next;

static void print_raid5_conf (struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)

	BUG_ON(stripe_operations_active(sh));
		do_release_stripe(conf, sh);
		do_release_stripe(conf, sh);

static inline void remove_hash(struct stripe_head *sh)
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);
	hlist_del_init(&sh->hash);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);
	hlist_add_head(&sh->hash, hp);

	list_del_init(first);

	for (i = 0; i < num ; i++) {

	for (i = 0; i < num; i++) {
static void raid5_build_block(struct stripe_head *sh, int i, int previous);

	BUG_ON(stripe_operations_active(sh));

	pr_debug("init_stripe called, stripe %llu\n",
		(unsigned long long)sh->sector);

	stripe_set_idx(sector, conf, previous, sh);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->read || dev->towrite || dev->written ||
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
		raid5_build_block(sh, i, previous);
	insert_hash(conf, sh);

	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
		if (sh->sector == sector && sh->generation == generation)
	int degraded, degraded2;

	for (i = 0; i < conf->previous_raid_disks; i++) {
			if (conf->raid_disks >= conf->previous_raid_disks)
	if (conf->raid_disks == conf->previous_raid_disks)
	for (i = 0; i < conf->raid_disks; i++) {
			if (conf->raid_disks <= conf->previous_raid_disks)
	if (degraded2 > degraded)

static int has_failed(struct r5conf *conf)
	degraded = calc_degraded(conf);
		  int previous, int noblock, int noquiesce)

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

				    conf->quiesce == 0 || noquiesce,
		sh = __find_stripe(conf, sector, conf->generation - previous);
			sh = get_free_stripe(conf);
			if (noblock && sh == NULL)
				init_stripe(sh, sector, previous);
				if (list_empty(&sh->lru) &&
					list_del_init(&sh->lru);
	} while (sh == NULL);

raid5_end_read_request(struct bio *bi, int error);
raid5_end_write_request(struct bio *bi, int error);
	for (i = disks; i--; ) {
		int replace_only = 0;
		struct bio *bi, *rbi;
				&sh->dev[i].flags)) {

		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq;
			bi->bi_end_io = raid5_end_write_request;
			rbi->bi_end_io = raid5_end_write_request;
			bi->bi_end_io = raid5_end_read_request;

		while ((rw & WRITE) && rdev &&
				       &first_bad, &bad_sectors);
				if (!conf->mddev->external &&
				    conf->mddev->flags) {
				rdev_dec_pending(rdev, conf->mddev);

			bi->bi_bdev = rdev->bdev;
			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
				__func__, (unsigned long long)sh->sector,
			if (use_new_offset(conf, sh))
				bi->bi_sector = (sh->sector
				bi->bi_sector = (sh->sector
			bi->bi_flags = 1 << BIO_UPTODATE;
			bi->bi_io_vec[0].bv_offset = 0;

			rbi->bi_bdev = rrdev->bdev;
			pr_debug("%s: for %llu schedule op %ld on "
				 "replacement disc %d\n",
				__func__, (unsigned long long)sh->sector,
			if (use_new_offset(conf, sh))
				rbi->bi_sector = (sh->sector
				rbi->bi_sector = (sh->sector
			rbi->bi_flags = 1 << BIO_UPTODATE;
			rbi->bi_io_vec[0].bv_offset = 0;

		if (!rdev && !rrdev) {
			pr_debug("skip op %ld on disc %d for sector %llu\n",
				bi->bi_rw, i, (unsigned long long)sh->sector);
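/*
 * async_copy_data() copies between a bio's pages and a stripe cache page via
 * the async_tx API; @frombio selects the direction (bio -> stripe page when
 * draining writes, stripe page -> bio when filling reads).
 */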
async_copy_data(int frombio, struct bio *bio, struct page *page,
	struct page *bio_page;

	if (bio->bi_sector >= sector)
		page_offset = (signed)(bio->bi_sector - sector) * 512;
		page_offset = (signed)(sector - bio->bi_sector) * -512;

	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
	bio_for_each_segment(bvl, bio, i) {
		int len = bvl->bv_len;
		if (page_offset < 0) {
			page_offset += b_offset;
			b_offset += bvl->bv_offset;
			bio_page = bvl->bv_page;
					  b_offset, clen, &submit);
					  page_offset, clen, &submit);
			submit.depend_tx = tx;
static void ops_complete_biofill(void *stripe_head_ref)
	struct bio *return_bi = NULL;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
			struct bio *rbi, *rbi2;
			while (rbi && rbi->bi_sector <
				rbi2 = r5_next_bio(rbi, dev->sector);
				if (!raid5_dec_bi_active_stripes(rbi)) {
					rbi->bi_next = return_bi;

	return_io(return_bi);

static void ops_run_biofill(struct stripe_head *sh)
	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
			dev->read = rbi = dev->toread;
			while (rbi && rbi->bi_sector <
				tx = async_copy_data(0, rbi, dev->page,
				rbi = r5_next_bio(rbi, dev->sector);

	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
static void ops_complete_compute(void *stripe_head_ref)
	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

			 struct raid5_percpu *percpu)
	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);

ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int target = sh->ops.target;
	struct page *xor_dest = tgt->page;

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);

	for (i = disks; i--; )
			xor_srcs[count++] = sh->dev[i].page;

			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
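/*
 * set_syndrome_sources() populates @srcs with the stripe's pages in the order
 * the async P/Q engine expects, walking the devices from raid6_d0(); the
 * return value is the syndrome disk count used by the callers (DDF layouts
 * keep the parity slots in place within that count).
 */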
static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
	int d0_idx = raid6_d0(sh);

	for (i = 0; i < disks; i++)
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	return syndrome_disks;
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
	int disks = sh->disks;
	struct page **blocks = percpu->scribble;

	if (sh->ops.target < 0)
		target = sh->ops.target2;
	else if (sh->ops.target2 < 0)
		target = sh->ops.target;

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);

	if (target == qd_idx) {
		count = set_syndrome_sources(blocks, sh);
		BUG_ON(blocks[count+1] != dest);
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		for (i = disks; i-- ; ) {
			if (i == target || i == qd_idx)
			blocks[count++] = sh->dev[i].page;
				  NULL, ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt2 = &sh->dev[target2];
	struct page **blocks = percpu->scribble;

	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);

	for (i = 0; i < disks ; i++)
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	if (failb == syndrome_disks+1) {
		if (faila == syndrome_disks) {
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			if (target == qd_idx)
				data_target = target2;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
				blocks[count++] = sh->dev[i].page;
			dest = sh->dev[data_target].page;
			init_async_submit(&submit,
					  to_addr_conv(sh, percpu));
			count = set_syndrome_sources(blocks, sh);
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		if (failb == syndrome_disks) {
static void ops_complete_prexor(void *stripe_head_ref)
	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int count = 0, pd_idx = sh->pd_idx, i;

	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
			xor_srcs[count++] = dev->page;

			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
	int disks = sh->disks;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
			chosen = dev->towrite;
			dev->towrite = NULL;
			wbi = dev->written = chosen;
			while (wbi && wbi->bi_sector <
				tx = async_copy_data(1, wbi, dev->page,
				wbi = r5_next_bio(wbi, dev->sector);

static void ops_complete_reconstruct(void *stripe_head_ref)
	int disks = sh->disks;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->written || i == pd_idx || i == qd_idx) {
ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct page *xor_dest;
	unsigned long flags;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
	if (i >= sh->disks) {
		ops_complete_reconstruct(sh);
		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
				xor_srcs[count++] = dev->page;
		xor_dest = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
				xor_srcs[count++] = dev->page;

	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
			  to_addr_conv(sh, percpu));

ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
	struct page **blocks = percpu->scribble;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
	if (i >= sh->disks) {
		ops_complete_reconstruct(sh);

	count = set_syndrome_sources(blocks, sh);

	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
			  sh, to_addr_conv(sh, percpu));
static void ops_complete_check(void *stripe_head_ref)
	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
	int disks = sh->disks;
	struct page *xor_dest;
	struct page **xor_srcs = percpu->scribble;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	xor_dest = sh->dev[pd_idx].page;
	xor_srcs[count++] = xor_dest;
	for (i = disks; i--; ) {
		if (i == pd_idx || i == qd_idx)
		xor_srcs[count++] = sh->dev[i].page;

	init_async_submit(&submit, 0, NULL, NULL, NULL,
			  to_addr_conv(sh, percpu));
			   &sh->ops.zero_sum_result, &submit);

	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);

static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
	struct page **srcs = percpu->scribble;

	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
		(unsigned long long)sh->sector, checkp);

	count = set_syndrome_sources(srcs, sh);

	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
			  sh, to_addr_conv(sh, percpu));
			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
	int overlap_clear = 0, i, disks = sh->disks;
	struct raid5_percpu *percpu;

		ops_run_biofill(sh);
			tx = ops_run_compute5(sh, percpu);
			if (sh->ops.target2 < 0 || sh->ops.target < 0)
				tx = ops_run_compute6_1(sh, percpu);
				tx = ops_run_compute6_2(sh, percpu);
		tx = ops_run_prexor(sh, percpu, tx);
		tx = ops_run_biodrain(sh, tx);
			ops_run_reconstruct5(sh, percpu, tx);
			ops_run_reconstruct6(sh, percpu, tx);
			ops_run_check_p(sh, percpu);
			ops_run_check_pq(sh, percpu, 0);
			ops_run_check_pq(sh, percpu, 1);

		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];

#ifdef CONFIG_MULTICORE_RAID456
	unsigned long ops_request = sh->ops.request;

	__raid_run_ops(sh, ops_request);

	sh->ops.request = ops_request;

#define raid_run_ops __raid_run_ops
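/*
 * Stripe cache management: grow_one_stripe()/drop_one_stripe() add or remove
 * a single stripe_head (and its per-device pages) from the pool,
 * grow_stripes() pre-allocates the pool at setup time, and resize_stripes()
 * rebuilds it when the number of devices changes during a reshape.
 */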
static int grow_one_stripe(struct r5conf *conf)
#ifdef CONFIG_MULTICORE_RAID456
	if (grow_buffers(sh)) {
	INIT_LIST_HEAD(&sh->lru);

static int grow_stripes(struct r5conf *conf, int num)
	if (conf->mddev->gendisk)
			"raid%d-%s", conf->level, mdname(conf->mddev));

		sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
		if (!grow_one_stripe(conf))

static size_t scribble_len(int num)
	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
static int resize_stripes(struct r5conf *conf, int newsize)
	if (newsize <= conf->pool_size)
		sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
#ifdef CONFIG_MULTICORE_RAID456
		list_add(&nsh->lru, &newstripes);

	while (!list_empty(&newstripes)) {
		osh = get_free_stripe(conf);
			nsh->dev[i].page = osh->dev[i].page;
		for ( ; i < newsize; i++)
			ndisks[i] = conf->disks[i];
		conf->disks = ndisks;

		struct raid5_percpu *percpu;
		kfree(percpu->scribble);
		percpu->scribble = scribble;

	while (!list_empty(&newstripes)) {
		list_del_init(&nsh->lru);
			if (nsh->dev[i].page == NULL) {
				nsh->dev[i].page = p;
		release_stripe(nsh);

static int drop_one_stripe(struct r5conf *conf)
	sh = get_free_stripe(conf);

static void shrink_stripes(struct r5conf *conf)
	while (drop_one_stripe(conf))
static void raid5_end_read_request(struct bio *bi, int error)
	int disks = sh->disks, i;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);

	for (i = 0 ; i < disks; i++)
		if (bi == &sh->dev[i].req)

	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
		rdev = conf->disks[i].replacement;
		rdev = conf->disks[i].rdev;
	if (use_new_offset(conf, sh))

			"md/raid:%s: read error corrected"
			" (%lu sectors at %llu on %s)\n",
			(unsigned long long)s,
			"md/raid:%s: read error on replacement device "
			"(sector %llu on %s).\n",
			mdname(conf->mddev),
			(unsigned long long)s,
			"md/raid:%s: read error not correctable "
			"(sector %llu on %s).\n",
			mdname(conf->mddev),
			(unsigned long long)s,
			"md/raid:%s: read error NOT corrected!! "
			"(sector %llu on %s).\n",
			mdname(conf->mddev),
			(unsigned long long)s,
			"md/raid:%s: Too many read errors, failing device %s.\n",
			mdname(conf->mddev), bdn);

		rdev_dec_pending(rdev, conf->mddev);
static void raid5_end_write_request(struct bio *bi, int error)
	int disks = sh->disks, i;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
	int replacement = 0;

	for (i = 0 ; i < disks; i++) {
		if (bi == &sh->dev[i].req) {
			rdev = conf->disks[i].rdev;
		if (bi == &sh->dev[i].rreq) {
			rdev = conf->disks[i].replacement;
				rdev = conf->disks[i].rdev;

	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
		else if (is_badblock(rdev, sh->sector,
				     &first_bad, &bad_sectors))
				&rdev->mddev->recovery);
	} else if (is_badblock(rdev, sh->sector,
			       &first_bad, &bad_sectors))

	rdev_dec_pending(rdev, conf->mddev);
static void raid5_build_block(struct stripe_head *sh, int i, int previous)
	struct r5dev *dev = &sh->dev[i];

	dev->req.bi_io_vec = &dev->vec;
	dev->req.bi_max_vecs++;
	dev->req.bi_private = sh;
	dev->vec.bv_page = dev->page;

	dev->rreq.bi_io_vec = &dev->rvec;
	dev->rreq.bi_vcnt++;
	dev->rreq.bi_max_vecs++;
	dev->rreq.bi_private = sh;
	dev->rvec.bv_page = dev->page;

	dev->sector = compute_blocknr(sh, i, previous);

	unsigned long flags;
	pr_debug("raid456: error called\n");

	mddev->degraded = calc_degraded(conf);
	spin_unlock_irqrestore(&conf->device_lock, flags);
		"md/raid:%s: Disk failure on %s, disabling device.\n"
		"md/raid:%s: Operation continuing on %d devices.\n",
				     int previous, int *dd_idx,

	chunk_offset = sector_div(r_sector, sectors_per_chunk);
	chunk_number = r_sector;
	stripe = chunk_number;

	pd_idx = qd_idx = -1;
	switch(conf->level) {
		pd_idx = data_disks;
		switch (algorithm) {
			pd_idx = data_disks - sector_div(stripe2, raid_disks);
			if (*dd_idx >= pd_idx)
			if (*dd_idx >= pd_idx)
			pd_idx = data_disks - sector_div(stripe2, raid_disks);
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			pd_idx = data_disks;

		switch (algorithm) {
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
			} else if (*dd_idx >= pd_idx)
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
			} else if (*dd_idx >= pd_idx)
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = (pd_idx + 1) % raid_disks;
			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
			qd_idx = (pd_idx + 1) % raid_disks;
			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
			pd_idx = data_disks;
			qd_idx = data_disks + 1;
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
			} else if (*dd_idx >= pd_idx)
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
			} else if (*dd_idx >= pd_idx)
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
			if (*dd_idx >= pd_idx)
			qd_idx = raid_disks - 1;
			if (*dd_idx >= pd_idx)
			qd_idx = raid_disks - 1;
			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
			qd_idx = raid_disks - 1;
			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
			qd_idx = raid_disks - 1;
			qd_idx = raid_disks - 1;

	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
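/*
 * compute_blocknr() is the inverse of raid5_compute_sector(): given a
 * stripe_head and a device index it recomputes the logical array sector,
 * and cross-checks the result by mapping it forward again.
 */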
	int raid_disks = sh->disks;
	int algorithm = previous ? conf->prev_algo
	int dummy1, dd_idx = i;

	chunk_offset = sector_div(new_sector, sectors_per_chunk);
	stripe = new_sector;

	switch(conf->level) {
		switch (algorithm) {
		switch (algorithm) {
			if (sh->pd_idx == raid_disks-1)
			if (sh->pd_idx == raid_disks-1)
				i += data_disks + 1;

	chunk_number = stripe * data_disks + i;
	r_sector = chunk_number * sectors_per_chunk + chunk_offset;

	check = raid5_compute_sector(conf, r_sector,
				     previous, &dummy1, &sh2);
	    || sh2.qd_idx != sh->qd_idx) {
		printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
		       mdname(conf->mddev));
			 int rcw, int expand)
	int level = conf->level;

		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			struct r5dev *dev = &sh->dev[qd_idx];

	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
		__func__, (unsigned long long)sh->sector,
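/*
 * add_stripe_bio() links @bi into the sorted toread/towrite list of one
 * stripe device, returning 0 if it overlaps a bio already queued there.
 */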
static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
	pr_debug("adding bi b#%llu to stripe s#%llu\n",
		(unsigned long long)bi->bi_sector,
		(unsigned long long)sh->sector);

		bip = &sh->dev[dd_idx].towrite;
		bip = &sh->dev[dd_idx].toread;
	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
		bip = & (*bip)->bi_next;
	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))

	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
	raid5_inc_bi_active_stripes(bi);

		for (bi=sh->dev[dd_idx].towrite;
		     bi && bi->bi_sector <= sector;
		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
			if (bi->bi_sector + (bi->bi_size>>9) >= sector)
				sector = bi->bi_sector + (bi->bi_size>>9);

	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
		(unsigned long long)(*bip)->bi_sector,
		(unsigned long long)sh->sector, dd_idx);

	if (conf->mddev->bitmap && firstwrite) {
static void end_reshape(struct r5conf *conf);

static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
	int sectors_per_chunk =
	int chunk_offset = sector_div(stripe, sectors_per_chunk);

	raid5_compute_sector(conf,
			     *sectors_per_chunk + chunk_offset,
		     struct bio **return_bi)
	for (i = disks; i--; ) {
				rdev_dec_pending(rdev, conf->mddev);

		bi = sh->dev[i].towrite;
		while (bi && bi->bi_sector <
			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
			if (!raid5_dec_bi_active_stripes(bi)) {
				bi->bi_next = *return_bi;

		bi = sh->dev[i].written;
		if (bi) bitmap_end = 1;
		while (bi && bi->bi_sector <
			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
			if (!raid5_dec_bi_active_stripes(bi)) {
				bi->bi_next = *return_bi;

			bi = sh->dev[i].toread;
			while (bi && bi->bi_sector <
				struct bio *nextbi =
					r5_next_bio(bi, sh->dev[i].sector);
				if (!raid5_dec_bi_active_stripes(bi)) {
					bi->bi_next = *return_bi;

		rdev = conf->disks[i].replacement;
			conf->mddev->recovery_disabled;
static int want_replace(struct stripe_head *sh, int disk_idx)
	rdev = sh->raid_conf->disks[disk_idx].replacement;

		       int disk_idx, int disks)
	struct r5dev *dev = &sh->dev[disk_idx];

	     (s->replacing && want_replace(sh, disk_idx)) ||
	     (s->failed >= 1 && fdev[0]->toread) ||
	     (s->failed >= 2 && fdev[1]->toread) ||

			pr_debug("Computing stripe %llu block %d\n",
				 (unsigned long long)sh->sector, disk_idx);
			sh->ops.target = disk_idx;
			sh->ops.target2 = -1;
			for (other = disks; other--; ) {
				if (other == disk_idx)
					     &sh->dev[other].flags))
			pr_debug("Computing stripe %llu blocks %d,%d\n",
				 (unsigned long long)sh->sector,
			sh->ops.target = disk_idx;

		pr_debug("Reading block %d (sync=%d)\n",
static void handle_stripe_fill(struct stripe_head *sh,
		for (i = disks; i--; )
			if (fetch_block(sh, s, i, disks))

static void handle_stripe_clean_event(struct r5conf *conf,
	struct stripe_head *sh, int disks, struct bio **return_bi)
	for (i = disks; i--; )
		if (sh->dev[i].written) {
				struct bio *wbi, *wbi2;
				pr_debug("Return write for disc %d\n", i);
				dev->written = NULL;
				while (wbi && wbi->bi_sector <
					wbi2 = r5_next_bio(wbi, dev->sector);
					if (!raid5_dec_bi_active_stripes(wbi)) {
						wbi->bi_next = *return_bi;
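/*
 * handle_stripe_dirtying() picks between read-modify-write (read the old data
 * and parity that are about to change) and reconstruct-write (read the blocks
 * that are not being written) by counting how many reads each would need,
 * then schedules the cheaper one; RCW is forced while resync has not yet
 * passed this stripe.
 */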
static void handle_stripe_dirtying(struct r5conf *conf,
	int rmw = 0, rcw = 0, i;

	    (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
		pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
			 (unsigned long long)sh->sector);
	} else for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		if ((dev->towrite || i == sh->pd_idx) &&

	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
		(unsigned long long)sh->sector, rmw, rcw);
	if (rmw < rcw && rmw > 0)
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if ((dev->towrite || i == sh->pd_idx) &&
					"%d for r-m-w\n", i);
	if (rcw <= rmw && rcw > 0) {
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
					"%d for Reconstruct\n", i);

	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
		schedule_reconstruction(sh, s, rcw == 0, 0);
	struct r5dev *dev = NULL;
			sh->ops.target2 = -1;
			       (unsigned long long) sh->sector);
		sh->ops.zero_sum_result = 0;
			dev = &sh->dev[pd_idx];
			dev = &sh->dev[qd_idx];
		if (sh->ops.zero_sum_result == 0) {
			int *target = &sh->ops.target;
			sh->ops.target = -1;
			sh->ops.target2 = -1;
					&sh->dev[pd_idx].flags);
				target = &sh->ops.target2;
					&sh->dev[qd_idx].flags);
		       (unsigned long long) sh->sector);
static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
	for (i = 0; i < sh->disks; i++)
			sector_t bn = compute_blocknr(sh, i, 1);
			sector_t s = raid5_compute_sector(conf, bn, 0,
			sh2 = get_active_stripe(conf, s, 0, 1, 1);
				release_stripe(sh2);
			init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
			release_stripe(sh2);
	int disks = sh->disks;
	int do_recovery = 0;

	memset(s, 0, sizeof(*s));

	for (i = disks; i--; ) {
		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
			dev->toread, dev->towrite, dev->written);
		else if (dev->toread)
				 &first_bad, &bad_sectors))
				  &first_bad, &bad_sectors);
				conf->disks[i].rdev);
				conf->disks[i].rdev);
				conf->disks[i].replacement);
	int disks = sh->disks;
	struct r5dev *pdev, *qdev;

	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
		"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",

	analyse_stripe(sh, &s);

	pr_debug("locked=%d uptodate=%d to_read=%d"
	       " to_write=%d failed=%d failed_num=%d,%d\n",

		handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
		handle_failed_sync(conf, sh, &s);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);

		handle_stripe_fill(sh, &s, disks);

		handle_stripe_dirtying(conf, sh, &s, disks);

		if (conf->level == 6)
			handle_parity_checks6(conf, sh, &s, disks);
			handle_parity_checks5(conf, sh, &s, disks);

	for (i = 0; i < s.failed; i++) {
			= get_active_stripe(conf, sh->sector, 1, 1, 1);
			release_stripe(sh_src);
			release_stripe(sh_src);

		stripe_set_idx(sh->sector, conf, 0, sh);
		schedule_reconstruction(sh, &s, 1, 1);

		handle_stripe_expansion(conf, sh);

	if (conf->mddev->external)

	for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
				rdev = conf->disks[i].rdev;
				rdev_dec_pending(rdev, conf->mddev);
				rdev = conf->disks[i].rdev;
				rdev_dec_pending(rdev, conf->mddev);
				rdev = conf->disks[i].replacement;
					rdev = conf->disks[i].rdev;
				rdev_dec_pending(rdev, conf->mddev);
static void raid5_activate_delayed(struct r5conf *conf)

static void activate_bit_delay(struct r5conf *conf)
	while (!list_empty(&head)) {
		list_del_init(&sh->lru);
		__release_stripe(conf, sh);

static int raid5_congested(void *data, int bits)
	struct mddev *mddev = data;

			       struct bvec_merge_data *bvm,
			       struct bio_vec *biovec)
	struct mddev *mddev = q->queuedata;
	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
	unsigned int bio_sectors = bvm->bi_size >> 9;

	if ((bvm->bi_rw & 1) == WRITE)
		return biovec->bv_len;

	max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
	if (max < 0) max = 0;
	if (max <= biovec->bv_len && bio_sectors == 0)
		return biovec->bv_len;
static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
	unsigned int bio_sectors = bio->bi_size >> 9;

	return chunk_sectors >=
		((sector & (chunk_sectors - 1)) + bio_sectors);

static void add_bio_to_retry(struct bio *bi, struct r5conf *conf)
	unsigned long flags;
	spin_unlock_irqrestore(&conf->device_lock, flags);

static struct bio *remove_bio_from_retry(struct r5conf *conf)
		raid5_set_bi_stripes(bi, 1);

static void raid5_align_endio(struct bio *bi, int error)
	struct bio *raid_bi = bi->bi_private;
	struct mddev *mddev;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);

	rdev = (void *)raid_bi->bi_next;
	raid_bi->bi_next = NULL;
	mddev = rdev->mddev;

	rdev_dec_pending(rdev, conf->mddev);

	if (!error && uptodate) {

	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");

	add_bio_to_retry(raid_bi, conf);

static int bio_fits_rdev(struct bio *bi)
	if ((bi->bi_size>>9) > queue_max_sectors(q))
	if (bi->bi_phys_segments > queue_max_segments(q))
	if (q->merge_bvec_fn)
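/*
 * chunk_aligned_read() services a read that fits entirely inside one chunk by
 * cloning the bio and sending it straight to the member disk, bypassing the
 * stripe cache; on error raid5_align_endio() re-queues it for the normal
 * stripe-based retry path.
 */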
static int chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
	struct bio *align_bi;

	if (!in_chunk_boundary(mddev, raid_bio)) {
		pr_debug("chunk_aligned_read : non aligned\n");

	align_bi->bi_end_io = raid5_align_endio;
	align_bi->bi_private = raid_bio;

	align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
	end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);

		raid_bio->bi_next = (void *)rdev;
		align_bi->bi_bdev = rdev->bdev;
		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);

		if (!bio_fits_rdev(align_bi) ||
		    is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
				&first_bad, &bad_sectors)) {
			rdev_dec_pending(rdev, mddev);
	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
		  list_empty(&conf->handle_list) ? "empty" : "busy",
		  list_empty(&conf->hold_list) ? "empty" : "busy",

	} else if (!list_empty(&conf->hold_list) &&
		list_del_init(&sh->lru);

static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
	struct mddev *mddev = cb->cb.data;

	if (cb->list.next && !list_empty(&cb->list)) {
		while (!list_empty(&cb->list)) {
			list_del_init(&sh->lru);
			__release_stripe(conf, sh);

static void release_stripe_plug(struct mddev *mddev,
						  raid5_unplug, mddev,
	if (cb->list.next == NULL)
		INIT_LIST_HEAD(&cb->list);
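/*
 * make_discard_request() handles DISCARD bios by attaching the bio as a
 * pending write to every data device of each stripe the discard fully
 * covers, so the stripe machinery can discard the parity blocks as well.
 */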
static void make_discard_request(struct mddev *mddev, struct bio *bi)
	sector_t logical_sector, last_sector;

	last_sector = bi->bi_sector + (bi->bi_size>>9);
	bi->bi_phys_segments = 1;

	for (; logical_sector < last_sector;
		sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
			if (sh->dev[d].towrite || sh->dev[d].toread) {
			raid5_inc_bi_active_stripes(bi);
		if (conf->mddev->bitmap) {
		release_stripe_plug(mddev, sh);

	remaining = raid5_dec_bi_active_stripes(bi);
	if (remaining == 0) {
static void make_request(struct mddev *mddev, struct bio *bi)
	sector_t logical_sector, last_sector;
	const int rw = bio_data_dir(bi);

	    chunk_aligned_read(mddev,bi))

		make_discard_request(mddev, bi);

	last_sector = bi->bi_sector + (bi->bi_size>>9);
	bi->bi_phys_segments = 1;

	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
				? logical_sector < conf->reshape_progress
				? logical_sector < conf->reshape_safe

		new_sector = raid5_compute_sector(conf, logical_sector,
		pr_debug("raid456: make_request, sector %llu logical %llu\n",
			(unsigned long long)new_sector,
			(unsigned long long)logical_sector);

		sh = get_active_stripe(conf, new_sector, previous,
				    : logical_sector < conf->reshape_progress)
			    logical_sector < mddev->suspend_hi) {
				    logical_sector < mddev->suspend_hi)
			    !add_stripe_bio(sh, bi, dd_idx, rw)) {
			release_stripe_plug(mddev, sh);

	remaining = raid5_dec_bi_active_stripes(bi);
	if (remaining == 0) {

static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
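/*
 * reshape_request() advances a reshape by one reshape_sectors-sized window:
 * it allocates stripes at the new location, fills them by reading through the
 * old geometry, and periodically checkpoints progress in the metadata so an
 * interrupted reshape can be restarted safely.
 */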
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
	sector_t first_sector, last_sector;
	sector_t writepos, readpos, safepos;
	int reshape_sectors;

	if (sector_nr == 0) {
			sector_nr = raid5_size(mddev, 0, 0)

	readpos += reshape_sectors;
	safepos += reshape_sectors;
		writepos += reshape_sectors;
		stripe_addr = writepos;
			  - reshape_sectors - stripe_addr
		BUG_ON(writepos != sector_nr + reshape_sectors);
		stripe_addr = sector_nr;

	    ? (safepos > writepos && readpos < writepos)
	    : (safepos < writepos && readpos > writepos)) ||

	INIT_LIST_HEAD(&stripes);
		int skipped_disk = 0;
		sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
		for (j = sh->disks; j--; ) {
			if (conf->level == 6 &&
			s = compute_blocknr(sh, j, 0);
			if (s < raid5_size(mddev, 0, 0)) {
		if (!skipped_disk) {
		list_add(&sh->lru, &stripes);

		raid5_compute_sector(conf, stripe_addr*(new_data_disks),
		raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
				     * new_data_disks - 1),
	while (first_sector <= last_sector) {
		sh = get_active_stripe(conf, first_sector, 1, 0, 1);

	while (!list_empty(&stripes)) {
		list_del_init(&sh->lru);

	sector_nr += reshape_sectors;
	return reshape_sectors;
static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
	int still_degraded = 0;

	if (sector_nr >= max_sector) {
		return reshape_request(mddev, sector_nr, skipped);

	sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
		sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
		if (conf->disks[i].rdev == NULL)

static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
	sector = raid5_compute_sector(conf, logical_sector,
	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);

	for (; logical_sector < last_sector;
		if (scnt < raid5_bi_processed_stripes(raid_bio))

		sh = get_active_stripe(conf, sector, 0, 1, 0);
			raid5_set_bi_processed_stripes(raid_bio, scnt);

		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
			raid5_set_bi_processed_stripes(raid_bio, scnt);

	remaining = raid5_dec_bi_active_stripes(raid_bio);
#define MAX_STRIPE_BATCH 8
static int handle_active_stripes(struct r5conf *conf)
	int i, batch_size = 0;

	       (sh = __get_priority_stripe(conf)) != NULL)
		batch[batch_size++] = sh;

	if (batch_size == 0)

	for (i = 0; i < batch_size; i++)
		handle_stripe(batch[i]);

	for (i = 0; i < batch_size; i++)
		__release_stripe(conf, batch[i]);
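/*
 * raid5d is the per-array service thread: it re-activates delayed and
 * bitmap-delayed stripes, retries queued chunk-aligned reads, and processes
 * active stripes in batches of up to MAX_STRIPE_BATCH.
 */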
	struct mddev *mddev = thread->mddev;

		activate_bit_delay(conf);
			raid5_activate_delayed(conf);

		while ((bio = remove_bio_from_retry(conf))) {
			ok = retry_aligned_read(conf, bio);

		batch_size = handle_active_stripes(conf);
		handled += batch_size;

	pr_debug("%d stripes handled\n", handled);

	async_tx_issue_pending_all();
raid5_show_stripe_cache_size(struct mddev *mddev, char *page)

	if (size <= 16 || size > 32768)
		if (drop_one_stripe(conf))
		if (grow_one_stripe(conf))

raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)

				raid5_show_stripe_cache_size,
				raid5_store_stripe_cache_size);

raid5_show_preread_threshold(struct mddev *mddev, char *page)

raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)

raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
					raid5_show_preread_threshold,
					raid5_store_preread_threshold);

stripe_cache_active_show(struct mddev *mddev, char *page)

raid5_stripecache_active = __ATTR_RO(stripe_cache_active);

static struct attribute *raid5_attrs[] = {
	&raid5_stripecache_size.attr,
	&raid5_stripecache_active.attr,
	&raid5_preread_bypass_threshold.attr,

	.attrs = raid5_attrs,

raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
static void raid5_free_percpu(struct r5conf *conf)
	struct raid5_percpu *percpu;

		safe_put_page(percpu->spare_page);
		kfree(percpu->scribble);
#ifdef CONFIG_HOTPLUG_CPU
	unregister_cpu_notifier(&conf->cpu_notify);

static void free_conf(struct r5conf *conf)
	shrink_stripes(conf);
	raid5_free_percpu(conf);

#ifdef CONFIG_HOTPLUG_CPU
	long cpu = (long)hcpu;

		if (conf->level == 6 && !percpu->spare_page)
		if (!percpu->scribble)
		if (!percpu->scribble ||
		    (conf->level == 6 && !percpu->spare_page)) {
			safe_put_page(percpu->spare_page);
			kfree(percpu->scribble);
			pr_err("%s: failed memory allocation for cpu%ld\n",
			return notifier_from_errno(-ENOMEM);

		safe_put_page(percpu->spare_page);
		kfree(percpu->scribble);
		percpu->spare_page = NULL;
		percpu->scribble = NULL;
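/*
 * Each CPU gets a scribble buffer (scratch page pointers plus addr_conv
 * space for the async engines) and, for RAID6, a spare page used while
 * checking the Q syndrome; the CPU-hotplug notifier above keeps these
 * allocations in step with the set of online CPUs.
 */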
static int raid5_alloc_percpu(struct r5conf *conf)
	struct page *spare_page;
	struct raid5_percpu __percpu *allcpus;

		if (conf->level == 6) {

#ifdef CONFIG_HOTPLUG_CPU
	conf->cpu_notify.notifier_call = raid456_cpu_notify;
	conf->cpu_notify.priority = 0;
		err = register_cpu_notifier(&conf->cpu_notify);
static struct r5conf *setup_conf(struct mddev *mddev)
	int raid_disk, memory, max_disks;

		printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
		return ERR_PTR(-EIO);
	     && !algorithm_valid_raid5(mddev->new_layout)) ||
	     && !algorithm_valid_raid6(mddev->new_layout))) {
		return ERR_PTR(-EIO);
		printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",

	conf->mddev = mddev;

	if (raid5_alloc_percpu(conf) != 0)

	pr_debug("raid456: run(%s) called.\n", mdname(mddev));

		if (raid_disk >= max_disks
		disk = conf->disks + raid_disk;

	if (conf->level == 6)

		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
		       "md/raid:%s: couldn't allocate %dkB for buffers\n",
		       mdname(mddev), memory);
		       mdname(mddev), memory);
		       "md/raid:%s: couldn't allocate thread.\n",

	return ERR_PTR(-EIO);
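/*
 * only_parity() reports whether the given raid_disk holds nothing but parity
 * blocks under the given layout; run() uses it so an out-of-sync parity-only
 * disk is counted as "dirty parity" rather than as a failed data disk.
 */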
static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
		if (raid_disk < max_degraded)
		if (raid_disk >= raid_disks - max_degraded)
		if (raid_disk == 0 ||
		    raid_disk == raid_disks - 1)
		if (raid_disk == raid_disks - 1)
static int run(struct mddev *mddev)
	int working_disks = 0;
	int dirty_parity_disks = 0;
	long long min_offset_diff = 0;

		       " -- starting background reconstruction\n",
			min_offset_diff = diff;
			 diff < min_offset_diff)
			min_offset_diff = diff;
			 diff > min_offset_diff)
			min_offset_diff = diff;

		int max_degraded = (mddev->level == 6 ? 2 : 1);
			       "required - aborting.\n",
			       "on a stripe boundary\n", mdname(mddev));
				     (old_disks-max_degraded));
			       " confused - aborting\n", mdname(mddev));
		} else if (mddev->ro == 0) {
			       "must be started in read-only mode "
			       "auto-recovery - aborting.\n",

	conf = setup_conf(mddev);
		return PTR_ERR(conf);

		rdev = conf->disks[i].rdev;
		if (!rdev && conf->disks[i].replacement) {
			rdev = conf->disks[i].replacement;
		if (conf->disks[i].replacement &&
			       "replacement and reshape.\n");
			dirty_parity_disks++;

	mddev->degraded = calc_degraded(conf);

	if (has_failed(conf)) {
		       " (%d/%d failed)\n",

	if (mddev->degraded > dirty_parity_disks &&
			       "md/raid:%s: starting dirty degraded array"
			       " - data corruption possible.\n",
			       "md/raid:%s: cannot start dirty degraded array.\n",

		       " devices, algorithm %d\n", mdname(mddev), conf->level,
		       " out of %d devices, algorithm %d\n",
		       mdname(mddev), conf->level,

	print_raid5_conf(conf);
	if (mddev->to_remove == &raid5_attrs_group)
	else if (mddev->kobj.sd &&
		       "raid5: failed to create sysfs attributes for %s\n",

		bool discard_supported = true;
		int stripe = data_disks *
		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;

		mddev->queue->backing_dev_info.congested_data = mddev;
		mddev->queue->backing_dev_info.congested_fn = raid5_congested;

		while ((stripe-1) & stripe)
			stripe = (stripe | (stripe-1)) + 1;
		mddev->queue->limits.discard_granularity = stripe;
		mddev->queue->limits.discard_zeroes_data = 0;

			if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
			    !bdev_get_queue(rdev->bdev)->limits.discard_zeroes_data)
				discard_supported = false;

		if (discard_supported &&
		    mddev->queue->limits.max_discard_sectors >= stripe &&
		    mddev->queue->limits.discard_granularity >= stripe)
			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,

	print_raid5_conf(conf);
static int stop(struct mddev *mddev)
		mddev->queue->backing_dev_info.congested_fn = NULL;

	    conf->disks[i].rdev &&

static void print_raid5_conf (struct r5conf *conf)
		printk("(conf==NULL)\n");

static int raid5_spare_active(struct mddev *mddev)
	unsigned long flags;

				sysfs_notify_dirent_safe(
					tmp->rdev->sysfs_state);
			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
		} else if (tmp->rdev
			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);

	mddev->degraded = calc_degraded(conf);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	print_raid5_conf(conf);
static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
	print_raid5_conf(conf);
	if (rdev == p->rdev)
	    !has_failed(conf) &&
	    number < conf->raid_disks) {
	print_raid5_conf(conf);

static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
	for (disk = first; disk <= last; disk++) {
		p = conf->disks + disk;
		if (p->rdev == NULL) {
	for (disk = first; disk <= last; disk++) {
		p = conf->disks + disk;
	print_raid5_conf(conf);

static int raid5_resize(struct mddev *mddev, sector_t sectors)
	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
static int check_stripe_cache(struct mddev *mddev)

static int check_reshape(struct mddev *mddev)
	if (has_failed(conf))
	if (mddev->level == 6)
	if (!check_stripe_cache(mddev))

static int raid5_start_reshape(struct mddev *mddev)
	unsigned long flags;

	if (!check_stripe_cache(mddev))
	if (has_failed(conf))
		       "before number of disks\n", mdname(mddev));

		if (rdev->raid_disk < 0 &&
			if (raid5_add_disk(mddev, rdev) == 0) {
				if (sysfs_link_rdev(mddev, rdev))

	mddev->degraded = calc_degraded(conf);
	spin_unlock_irqrestore(&conf->device_lock, flags);

		mddev->reshape_position = MaxSector;
		spin_unlock_irq(&conf->device_lock);
	conf->reshape_checkpoint = jiffies;

		spin_lock_irq(&conf->device_lock);
		conf->previous_raid_disks = conf->raid_disks;
		conf->reshape_progress = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_for_overlap);

		int data_disks = conf->raid_disks - conf->max_degraded;
		int stripe = data_disks * ((conf->chunk_sectors << 9)
		if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
			conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
static void raid5_finish_reshape(struct mddev *mddev)
		mddev->degraded = calc_degraded(conf);
				rdev = conf->disks[d].replacement;

static void raid5_quiesce(struct mddev *mddev, int state)

static void *raid45_takeover_raid0(struct mddev *mddev, int level)
		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
	sectors = raid0_conf->strip_zone[0].zone_end;

	return setup_conf(mddev);

static void *raid5_takeover_raid1(struct mddev *mddev)
	return setup_conf(mddev);

static void *raid5_takeover_raid6(struct mddev *mddev)
	return setup_conf(mddev);
static int raid5_check_reshape(struct mddev *mddev)
	if (new_chunk > 0) {
		if (new_chunk < (PAGE_SIZE>>9))
	if (new_chunk > 0) {

	return check_reshape(mddev);

static int raid6_check_reshape(struct mddev *mddev)
	if (new_chunk > 0) {
		if (new_chunk < (PAGE_SIZE >> 9))

	return check_reshape(mddev);

static void *raid5_takeover(struct mddev *mddev)
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		return setup_conf(mddev);
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

static void *raid4_takeover(struct mddev *mddev)
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
		return setup_conf(mddev);

static void *raid6_takeover(struct mddev *mddev)
	if (mddev->pers != &raid5_personality)
	return setup_conf(mddev);
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.resize		= raid5_resize,
	.check_reshape	= raid6_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,

	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.resize		= raid5_resize,
	.check_reshape	= raid5_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,

	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.resize		= raid5_resize,
	.check_reshape	= raid5_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
static int __init raid5_init(void)

static void raid5_exit(void)