22 #include <linux/kernel.h>
23 #include <linux/sched.h>
25 #include <linux/module.h>
33 #include <linux/random.h>
35 #include <linux/time.h>
37 #include <linux/slab.h>
78 #define O2HB_DB_TYPE_LIVENODES 0
79 #define O2HB_DB_TYPE_LIVEREGIONS 1
80 #define O2HB_DB_TYPE_QUORUMREGIONS 2
81 #define O2HB_DB_TYPE_FAILEDREGIONS 3
82 #define O2HB_DB_TYPE_REGION_LIVENODES 4
83 #define O2HB_DB_TYPE_REGION_NUMBER 5
84 #define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
85 #define O2HB_DB_TYPE_REGION_PINNED 7
98 #define O2HB_DEBUG_DIR "o2hb"
99 #define O2HB_DEBUG_LIVENODES "livenodes"
100 #define O2HB_DEBUG_LIVEREGIONS "live_regions"
101 #define O2HB_DEBUG_QUORUMREGIONS "quorum_regions"
102 #define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
103 #define O2HB_DEBUG_REGION_NUMBER "num"
104 #define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
105 #define O2HB_DEBUG_REGION_PINNED "pinned"
107 static struct dentry *o2hb_debug_dir;
108 static struct dentry *o2hb_debug_livenodes;
109 static struct dentry *o2hb_debug_liveregions;
110 static struct dentry *o2hb_debug_quorumregions;
111 static struct dentry *o2hb_debug_failedregions;
115 static struct o2hb_callback {
121 #define O2HB_DEFAULT_BLOCK_BITS 9
151 #define O2HB_PIN_CUT_OFF 3
161 static int o2hb_region_pin(
const char *region_uuid);
162 static void o2hb_region_unpin(
const char *region_uuid);
169 static void o2hb_dead_threshold_set(
unsigned int threshold)
172 spin_lock(&o2hb_live_lock);
173 if (list_empty(&o2hb_all_regions))
175 spin_unlock(&o2hb_live_lock);
179 static int o2hb_global_hearbeat_mode_set(
unsigned int hb_mode)
184 spin_lock(&o2hb_live_lock);
185 if (list_empty(&o2hb_all_regions)) {
189 spin_unlock(&o2hb_live_lock);
285 static int o2hb_pop_count(
void *
map,
int count)
302 mlog(
ML_ERROR,
"Heartbeat write timeout to device %s after %u "
310 failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
312 quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
314 spin_unlock_irqrestore(&o2hb_live_lock, flags);
323 if ((failed << 1) < quorum)
330 static void o2hb_arm_write_timeout(
struct o2hb_region *reg)
340 spin_lock(&o2hb_live_lock);
342 spin_unlock(&o2hb_live_lock);
350 static void o2hb_disarm_write_timeout(
struct o2hb_region *reg)
376 static void o2hb_wait_on_io(
struct o2hb_region *reg,
379 o2hb_bio_wait_dec(wc, 1);
383 static void o2hb_bio_end_io(
struct bio *bio,
393 o2hb_bio_wait_dec(wc, 1);
399 static struct bio *o2hb_setup_one_bio(
struct o2hb_region *reg,
401 unsigned int *current_slot,
402 unsigned int max_slots)
404 int len, current_page;
405 unsigned int vec_len, vec_start;
408 unsigned int cs = *current_slot;
426 bio->bi_private =
wc;
427 bio->bi_end_io = o2hb_bio_end_io;
430 while(cs < max_slots) {
431 current_page = cs / spp;
438 current_page, vec_len, vec_start);
441 if (len != vec_len)
break;
452 static int o2hb_read_slots(
struct o2hb_region *reg,
453 unsigned int max_slots)
455 unsigned int current_slot=0;
460 o2hb_bio_wait_init(&wc);
462 while(current_slot < max_slots) {
463 bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots);
465 status = PTR_ERR(bio);
477 o2hb_wait_on_io(reg, &wc);
484 static int o2hb_issue_node_write(
struct o2hb_region *reg,
491 o2hb_bio_wait_init(write_wc);
495 bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1);
497 status = PTR_ERR(bio);
531 mlog(
ML_ERROR,
"Dump slot information: seq = 0x%llx, node = %u, "
532 "cksum = 0x%x, generation 0x%llx\n",
538 static int o2hb_verify_crc(
struct o2hb_region *reg,
544 computed = o2hb_compute_block_crc_le(reg, hb_block);
546 return read == computed;
556 static int o2hb_check_own_slot(
struct o2hb_region *reg)
573 #define ERRSTR1 "Another node is heartbeating on device"
574 #define ERRSTR2 "Heartbeat generation mismatch on device"
575 #define ERRSTR3 "Heartbeat sequence mismatch on device"
586 "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->
hr_dev_name,
595 static inline void o2hb_prepare_block(
struct o2hb_region *reg,
622 mlog(
ML_HB_BIO,
"our node generation = 0x%llx, cksum = 0x%x\n",
623 (
long long)generation,
627 static void o2hb_fire_callbacks(
struct o2hb_callback *hbcall,
645 struct o2hb_callback *hbcall;
648 spin_lock(&o2hb_live_lock);
649 empty = list_empty(&queued_event->
hn_item);
650 spin_unlock(&o2hb_live_lock);
659 spin_lock(&o2hb_live_lock);
660 while (!list_empty(&o2hb_node_events)
661 && !list_empty(&queued_event->
hn_item)) {
665 list_del_init(&event->
hn_item);
666 spin_unlock(&o2hb_live_lock);
681 spin_lock(&o2hb_live_lock);
683 spin_unlock(&o2hb_live_lock);
697 event->hn_event_type =
type;
698 event->hn_node =
node;
699 event->hn_node_num = node_num;
717 spin_lock(&o2hb_live_lock);
724 if (list_empty(&o2hb_live_slots[slot->
ds_node_num])) {
731 spin_unlock(&o2hb_live_lock);
733 o2hb_run_event_list(&event);
738 static void o2hb_set_quorum_device(
struct o2hb_region *reg)
751 spin_lock(&o2hb_live_lock);
762 sizeof(o2hb_live_node_bitmap)))
774 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
776 o2hb_region_unpin(
NULL);
778 spin_unlock(&o2hb_live_lock);
781 static int o2hb_check_slot(
struct o2hb_region *reg,
784 int changed = 0, gen_changed = 0;
791 unsigned int slot_dead_ms;
802 spin_lock(&o2hb_live_lock);
804 spin_unlock(&o2hb_live_lock);
809 if (!o2hb_verify_crc(reg, hb_block)) {
812 spin_lock(&o2hb_live_lock);
825 o2hb_dump_slot(hb_block);
857 "seq %llu last %llu changed %u equal %u\n",
864 spin_lock(&o2hb_live_lock);
877 if (list_empty(&o2hb_live_slots[slot->
ds_node_num])) {
900 if (slot_dead_ms && slot_dead_ms != dead_ms) {
903 "of %u ms, but our count is %u ms.\n"
904 "Please double check your configuration values "
905 "for 'O2CB_HEARTBEAT_THRESHOLD'\n",
927 if (list_empty(&o2hb_live_slots[slot->
ds_node_num])) {
950 spin_unlock(&o2hb_live_lock);
952 o2hb_run_event_list(&event);
961 static int o2hb_highest_node(
unsigned long *nodes,
968 while ((node =
find_next_bit(nodes, numbits, node + 1)) != -1) {
978 static int o2hb_do_disk_heartbeat(
struct o2hb_region *reg)
981 int membership_change = 0, own_slot_ok = 0;
987 sizeof(configured_nodes));
1004 highest_node = o2hb_highest_node(configured_nodes,
O2NM_MAX_NODES);
1015 ret = o2hb_read_slots(reg, highest_node + 1);
1024 own_slot_ok = o2hb_check_own_slot(reg);
1029 ret = o2hb_issue_node_write(reg, &write_wc);
1038 membership_change |= o2hb_check_slot(reg, &reg->
hr_slots[i]);
1046 o2hb_wait_on_io(reg, &write_wc);
1059 o2hb_set_quorum_device(reg);
1060 o2hb_arm_write_timeout(reg);
1066 if (!ret && own_slot_ok && !membership_change) {
1075 "heartbeart on region %s (%s)\n",
1076 config_item_name(&reg->
hr_item),
1090 static void o2hb_tv_subtract(
struct timeval *
a,
1109 static unsigned int o2hb_elapsed_msecs(
struct timeval *
start,
1114 o2hb_tv_subtract(&res, start);
1124 static int o2hb_thread(
void *
data)
1129 struct timeval before_hb, after_hb;
1130 unsigned int elapsed_msec;
1148 ret = o2hb_do_disk_heartbeat(reg);
1151 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
1154 "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
1155 before_hb.tv_sec, (
unsigned long) before_hb.tv_usec,
1156 after_hb.tv_sec, (
unsigned long) after_hb.tv_usec,
1160 elapsed_msec < reg->hr_timeout_ms) {
1167 o2hb_disarm_write_timeout(reg);
1171 o2hb_shutdown_slot(&reg->
hr_slots[i]);
1179 o2hb_prepare_block(reg, 0);
1180 ret = o2hb_issue_node_write(reg, &write_wc);
1182 o2hb_wait_on_io(reg, &write_wc);
1195 #ifdef CONFIG_DEBUG_FS
1218 spin_lock(&o2hb_live_lock);
1220 spin_unlock(&o2hb_live_lock);
1224 spin_lock(&o2hb_live_lock);
1227 spin_unlock(&o2hb_live_lock);
1260 i_size_write(inode, out);
1269 static int o2hb_debug_release(
struct inode *inode,
struct file *file)
1275 static ssize_t o2hb_debug_read(
struct file *file,
char __user *buf,
1276 size_t nbytes, loff_t *ppos)
1282 static int o2hb_debug_open(
struct inode *inode,
struct file *file)
1286 static int o2hb_debug_release(
struct inode *inode,
struct file *file)
1290 static ssize_t o2hb_debug_read(
struct file *file,
char __user *buf,
1291 size_t nbytes, loff_t *ppos)
1298 .open = o2hb_debug_open,
1299 .release = o2hb_debug_release,
1300 .read = o2hb_debug_read,
1306 kfree(o2hb_db_livenodes);
1307 kfree(o2hb_db_liveregions);
1308 kfree(o2hb_db_quorumregions);
1309 kfree(o2hb_db_failedregions);
1317 static struct dentry *o2hb_debug_create(
const char *
name,
struct dentry *dir,
1319 int type,
int size,
int len,
void *data)
1325 (*db)->db_type =
type;
1326 (*db)->db_size =
size;
1327 (*db)->db_len = len;
1328 (*db)->db_data =
data;
1334 static int o2hb_debug_init(
void)
1339 if (!o2hb_debug_dir) {
1347 sizeof(*o2hb_db_livenodes),
1349 sizeof(o2hb_live_node_bitmap),
1351 o2hb_live_node_bitmap);
1352 if (!o2hb_debug_livenodes) {
1359 &o2hb_db_liveregions,
1360 sizeof(*o2hb_db_liveregions),
1362 sizeof(o2hb_live_region_bitmap),
1364 o2hb_live_region_bitmap);
1365 if (!o2hb_debug_liveregions) {
1370 o2hb_debug_quorumregions =
1373 &o2hb_db_quorumregions,
1374 sizeof(*o2hb_db_quorumregions),
1376 sizeof(o2hb_quorum_region_bitmap),
1378 o2hb_quorum_region_bitmap);
1379 if (!o2hb_debug_quorumregions) {
1384 o2hb_debug_failedregions =
1387 &o2hb_db_failedregions,
1388 sizeof(*o2hb_db_failedregions),
1390 sizeof(o2hb_failed_region_bitmap),
1392 o2hb_failed_region_bitmap);
1393 if (!o2hb_debug_failedregions) {
1410 for (i = 0; i <
ARRAY_SIZE(o2hb_callbacks); i++)
1411 INIT_LIST_HEAD(&o2hb_callbacks[i].
list);
1413 for (i = 0; i <
ARRAY_SIZE(o2hb_live_slots); i++)
1414 INIT_LIST_HEAD(&o2hb_live_slots[i]);
1416 INIT_LIST_HEAD(&o2hb_node_events);
1418 memset(o2hb_live_node_bitmap, 0,
sizeof(o2hb_live_node_bitmap));
1419 memset(o2hb_region_bitmap, 0,
sizeof(o2hb_region_bitmap));
1420 memset(o2hb_live_region_bitmap, 0,
sizeof(o2hb_live_region_bitmap));
1421 memset(o2hb_quorum_region_bitmap, 0,
sizeof(o2hb_quorum_region_bitmap));
1422 memset(o2hb_failed_region_bitmap, 0,
sizeof(o2hb_failed_region_bitmap));
1426 return o2hb_debug_init();
1430 static void o2hb_fill_node_map_from_callback(
unsigned long *map,
1435 memcpy(map, &o2hb_live_node_bitmap, bytes);
1446 spin_lock(&o2hb_live_lock);
1447 o2hb_fill_node_map_from_callback(map, bytes);
1448 spin_unlock(&o2hb_live_lock);
1466 static void o2hb_region_release(
struct config_item *item)
1500 spin_lock(&o2hb_live_lock);
1502 spin_unlock(&o2hb_live_lock);
1507 static int o2hb_read_block_input(
struct o2hb_region *reg,
1510 unsigned long *ret_bytes,
1511 unsigned int *ret_bits)
1513 unsigned long bytes;
1514 char *
p = (
char *)page;
1517 if (!p || (*p && (*p !=
'\n')))
1521 if (bytes > 4096 || bytes < 512)
1529 *ret_bits =
ffs(bytes) - 1;
1545 unsigned long block_bytes;
1546 unsigned int block_bits;
1551 status = o2hb_read_block_input(reg, page, count,
1552 &block_bytes, &block_bits);
1572 unsigned long long tmp;
1573 char *p = (
char *)page;
1579 if (!p || (*p && (*p !=
'\n')))
1598 char *p = (
char *)page;
1604 if (!p || (*p && (*p !=
'\n')))
1618 unsigned int ret = 0;
1626 static void o2hb_init_region_params(
struct o2hb_region *reg)
1639 static int o2hb_map_slot_data(
struct o2hb_region *reg)
1642 unsigned int last_slot;
1670 "at %u blocks per page\n",
1689 last_slot = i * spp;
1692 (j < spp) && ((j + last_slot) < reg->
hr_blocks);
1696 slot = &reg->
hr_slots[j + last_slot];
1710 static int o2hb_populate_slot_data(
struct o2hb_region *reg)
1716 ret = o2hb_read_slots(reg, reg->
hr_blocks);
1748 char *p = (
char *)page;
1750 struct inode *
inode;
1763 if (!p || (*p && (*p !=
'\n')))
1777 inode =
igrab(f.file->f_mapping->host);
1794 sectsize = bdev_logical_block_size(reg->
hr_bdev);
1797 "blocksize %u incorrect for device, expected %d",
1803 o2hb_init_region_params(reg);
1811 ret = o2hb_map_slot_data(reg);
1817 ret = o2hb_populate_slot_data(reg);
1835 spin_lock(&o2hb_live_lock);
1837 live_threshold <<= 1;
1838 spin_unlock(&o2hb_live_lock);
1845 hb_task =
kthread_run(o2hb_thread, reg,
"o2hb-%s",
1847 if (IS_ERR(hb_task)) {
1848 ret = PTR_ERR(hb_task);
1853 spin_lock(&o2hb_live_lock);
1855 spin_unlock(&o2hb_live_lock);
1870 spin_lock(&o2hb_live_lock);
1874 spin_unlock(&o2hb_live_lock);
1904 spin_lock(&o2hb_live_lock);
1906 pid = task_pid_nr(reg->
hr_task);
1907 spin_unlock(&o2hb_live_lock);
1912 return sprintf(page,
"%u\n", pid);
1923 .ca_name =
"block_bytes",
1925 .show = o2hb_region_block_bytes_read,
1926 .store = o2hb_region_block_bytes_write,
1931 .ca_name =
"start_block",
1933 .show = o2hb_region_start_block_read,
1934 .store = o2hb_region_start_block_write,
1939 .ca_name =
"blocks",
1941 .show = o2hb_region_blocks_read,
1942 .store = o2hb_region_blocks_write,
1949 .show = o2hb_region_dev_read,
1950 .store = o2hb_region_dev_write,
1957 .show = o2hb_region_pid_read,
1961 &o2hb_region_attr_block_bytes.
attr,
1962 &o2hb_region_attr_start_block.
attr,
1963 &o2hb_region_attr_blocks.
attr,
1964 &o2hb_region_attr_dev.
attr,
1965 &o2hb_region_attr_pid.
attr,
1978 if (o2hb_region_attr->
show)
1979 ret = o2hb_region_attr->
show(reg, page);
1985 const char *page,
size_t count)
1992 if (o2hb_region_attr->
store)
1993 ret = o2hb_region_attr->
store(reg, page, count);
1998 .release = o2hb_region_release,
1999 .show_attribute = o2hb_region_show,
2000 .store_attribute = o2hb_region_store,
2004 .ct_item_ops = &o2hb_region_item_ops,
2005 .ct_attrs = o2hb_region_attrs,
2103 spin_lock(&o2hb_live_lock);
2109 spin_unlock(&o2hb_live_lock);
2116 spin_unlock(&o2hb_live_lock);
2120 ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
2129 return ERR_PTR(ret);
2132 static void o2hb_heartbeat_group_drop_item(
struct config_group *group,
2137 int quorum_region = 0;
2140 spin_lock(&o2hb_live_lock);
2144 spin_unlock(&o2hb_live_lock);
2150 spin_lock(&o2hb_live_lock);
2156 spin_unlock(&o2hb_live_lock);
2159 "stopped" :
"start aborted"), config_item_name(item),
2182 spin_lock(&o2hb_live_lock);
2187 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2189 o2hb_region_pin(
NULL);
2192 spin_unlock(&o2hb_live_lock);
2210 if (o2hb_heartbeat_group_attr->
show)
2211 ret = o2hb_heartbeat_group_attr->
show(reg, page);
2217 const char *page,
size_t count)
2224 if (o2hb_heartbeat_group_attr->
store)
2225 ret = o2hb_heartbeat_group_attr->
store(reg, page, count);
2240 char *p = (
char *)page;
2243 if (!p || (*p && (*p !=
'\n')))
2247 o2hb_dead_threshold_set((
unsigned int) tmp);
2262 const char *page,
size_t count)
2268 len = (page[count - 1] ==
'\n') ? count - 1 : count;
2276 ret = o2hb_global_hearbeat_mode_set(i);
2289 .ca_name =
"dead_threshold",
2291 .show = o2hb_heartbeat_group_threshold_show,
2292 .store = o2hb_heartbeat_group_threshold_store,
2299 .show = o2hb_heartbeat_group_mode_show,
2300 .store = o2hb_heartbeat_group_mode_store,
2304 &o2hb_heartbeat_group_attr_threshold.
attr,
2305 &o2hb_heartbeat_group_attr_mode.
attr,
2310 .show_attribute = o2hb_heartbeat_group_show,
2311 .store_attribute = o2hb_heartbeat_group_store,
2315 .make_item = o2hb_heartbeat_group_make_item,
2316 .drop_item = o2hb_heartbeat_group_drop_item,
2320 .ct_group_ops = &o2hb_heartbeat_group_group_ops,
2321 .ct_item_ops = &o2hb_hearbeat_group_item_ops,
2322 .ct_attrs = o2hb_heartbeat_group_attrs,
2338 &o2hb_heartbeat_group_type);
2360 return &o2hb_callbacks[
type];
2385 static int o2hb_region_pin(
const char *region_uuid)
2387 int ret = 0, found = 0;
2394 uuid = config_item_name(&reg->
hr_item);
2398 if (
strcmp(region_uuid, uuid))
2412 if (ret == -
ENOENT && found)
2435 static void o2hb_region_unpin(
const char *region_uuid)
2444 uuid = config_item_name(&reg->
hr_item);
2446 if (
strcmp(region_uuid, uuid))
2461 static int o2hb_region_inc_user(
const char *region_uuid)
2465 spin_lock(&o2hb_live_lock);
2469 ret = o2hb_region_pin(region_uuid);
2481 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2483 ret = o2hb_region_pin(
NULL);
2486 spin_unlock(&o2hb_live_lock);
2492 spin_lock(&o2hb_live_lock);
2496 o2hb_region_unpin(region_uuid);
2506 o2hb_region_unpin(
NULL);
2509 spin_unlock(&o2hb_live_lock);
2517 struct o2hb_callback *hbcall;
2523 hbcall = hbcall_from_type(hc->
hc_type);
2524 if (IS_ERR(hbcall)) {
2525 ret = PTR_ERR(hbcall);
2530 ret = o2hb_region_inc_user(region_uuid);
2553 ret, __builtin_return_address(0), hc);
2564 __builtin_return_address(0), hc);
2586 if (!
test_bit(node_num, testing_map)) {
2588 "node (%u) does not have heartbeating enabled.\n",
2601 o2hb_fill_node_map_from_callback(testing_map,
sizeof(testing_map));
2602 if (!
test_bit(node_num, testing_map)) {
2604 "node (%u) does not have heartbeating enabled.\n",
2638 mlog(
ML_ERROR,
"stopping heartbeat on all active regions.\n");
2640 spin_lock(&o2hb_live_lock);
2645 spin_unlock(&o2hb_live_lock);
2655 spin_lock(&o2hb_live_lock);
2659 mlog(0,
"Region: %s\n", config_item_name(&reg->
hr_item));
2660 if (numregs < max_regions) {
2668 spin_unlock(&o2hb_live_lock);