1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Copyright (C) 2004, 2005 Oracle. All rights reserved. |
4 | */ |
5 | |
6 | #include <linux/kernel.h> |
7 | #include <linux/sched.h> |
8 | #include <linux/jiffies.h> |
9 | #include <linux/module.h> |
10 | #include <linux/fs.h> |
11 | #include <linux/bio.h> |
12 | #include <linux/blkdev.h> |
13 | #include <linux/delay.h> |
14 | #include <linux/file.h> |
15 | #include <linux/kthread.h> |
16 | #include <linux/configfs.h> |
17 | #include <linux/random.h> |
18 | #include <linux/crc32.h> |
19 | #include <linux/time.h> |
20 | #include <linux/debugfs.h> |
21 | #include <linux/slab.h> |
22 | #include <linux/bitmap.h> |
23 | #include <linux/ktime.h> |
24 | #include "heartbeat.h" |
25 | #include "tcp.h" |
26 | #include "nodemanager.h" |
27 | #include "quorum.h" |
28 | |
29 | #include "masklog.h" |
30 | |
31 | |
/*
 * The original heartbeat implementation ran a single global thread, which
 * implicitly serialized every hb callback invocation.  This global
 * semaphore preserves that serialization; it may only go away once every
 * callee is known to cope with concurrent calls from multiple hb region
 * threads.
 */
static DECLARE_RWSEM(o2hb_callback_sem);
39 | |
40 | /* |
41 | * multiple hb threads are watching multiple regions. A node is live |
42 | * whenever any of the threads sees activity from the node in its region. |
43 | */ |
44 | static DEFINE_SPINLOCK(o2hb_live_lock); |
45 | static struct list_head o2hb_live_slots[O2NM_MAX_NODES]; |
46 | static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
47 | static LIST_HEAD(o2hb_node_events); |
48 | static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); |
49 | |
50 | /* |
51 | * In global heartbeat, we maintain a series of region bitmaps. |
52 | * - o2hb_region_bitmap allows us to limit the region number to max region. |
53 | * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). |
54 | * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes |
55 | * heartbeat on it. |
56 | * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts. |
57 | */ |
58 | static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; |
59 | static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; |
60 | static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; |
61 | static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; |
62 | |
/* Values stored in o2hb_debug_buf.db_type. */
#define O2HB_DB_TYPE_LIVENODES		0
#define O2HB_DB_TYPE_LIVEREGIONS	1
#define O2HB_DB_TYPE_QUORUMREGIONS	2
#define O2HB_DB_TYPE_FAILEDREGIONS	3
#define O2HB_DB_TYPE_REGION_LIVENODES	4
#define O2HB_DB_TYPE_REGION_NUMBER	5
#define O2HB_DB_TYPE_REGION_ELAPSED_TIME	6
#define O2HB_DB_TYPE_REGION_PINNED	7

/* Descriptor for one debug buffer. */
struct o2hb_debug_buf {
	int db_type;	/* one of O2HB_DB_TYPE_* */
	int db_size;
	int db_len;
	void *db_data;
};
77 | |
/* Debug buffers for the global (not per-region) state. */
static struct o2hb_debug_buf *o2hb_db_livenodes;
static struct o2hb_debug_buf *o2hb_db_liveregions;
static struct o2hb_debug_buf *o2hb_db_quorumregions;
static struct o2hb_debug_buf *o2hb_db_failedregions;

/* Names of the debug directory and of the entries below it. */
#define O2HB_DEBUG_DIR			"o2hb"
#define O2HB_DEBUG_LIVENODES		"livenodes"
#define O2HB_DEBUG_LIVEREGIONS		"live_regions"
#define O2HB_DEBUG_QUORUMREGIONS	"quorum_regions"
#define O2HB_DEBUG_FAILEDREGIONS	"failed_regions"
#define O2HB_DEBUG_REGION_NUMBER	"num"
#define O2HB_DEBUG_REGION_ELAPSED_TIME	"elapsed_time_in_ms"
#define O2HB_DEBUG_REGION_PINNED	"pinned"

static struct dentry *o2hb_debug_dir;

/* Checked under o2hb_live_lock by the threshold/mode setters below. */
static LIST_HEAD(o2hb_all_regions);
95 | |
96 | static struct o2hb_callback { |
97 | struct list_head list; |
98 | } o2hb_callbacks[O2HB_NUM_CB]; |
99 | |
100 | static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); |
101 | |
/* Heartbeat modes; O2HB_HEARTBEAT_NUM_MODES is the count, not a mode. */
enum o2hb_heartbeat_modes {
	O2HB_HEARTBEAT_LOCAL = 0,
	O2HB_HEARTBEAT_GLOBAL,
	O2HB_HEARTBEAT_NUM_MODES,
};

/* Human-readable name of each heartbeat mode, indexed by mode value. */
static const char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
	[O2HB_HEARTBEAT_LOCAL]	= "local",
	[O2HB_HEARTBEAT_GLOBAL]	= "global",
};
112 | |
113 | unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; |
114 | static unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; |
115 | |
116 | /* |
117 | * o2hb_dependent_users tracks the number of registered callbacks that depend |
118 | * on heartbeat. o2net and o2dlm are two entities that register this callback. |
119 | * However only o2dlm depends on the heartbeat. It does not want the heartbeat |
120 | * to stop while a dlm domain is still active. |
121 | */ |
122 | static unsigned int o2hb_dependent_users; |
123 | |
124 | /* |
125 | * In global heartbeat mode, all regions are pinned if there are one or more |
126 | * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All |
127 | * regions are unpinned if the region count exceeds the cut off or the number |
128 | * of dependent users falls to zero. |
129 | */ |
130 | #define O2HB_PIN_CUT_OFF 3 |
131 | |
/*
 * Local heartbeat mode assumes the dlm domain name equals the region
 * uuid.  That holds for domains created for the file system, but not
 * necessarily for userdlm domains -- a known limitation.
 *
 * Global heartbeat mode pins/unpins all o2hb regions instead, which works
 * for file system and userdlm domains alike.
 */
static int o2hb_region_pin(const char *region_uuid);
static void o2hb_region_unpin(const char *region_uuid);
142 | |
143 | /* Only sets a new threshold if there are no active regions. |
144 | * |
145 | * No locking or otherwise interesting code is required for reading |
146 | * o2hb_dead_threshold as it can't change once regions are active and |
147 | * it's not interesting to anyone until then anyway. */ |
148 | static void o2hb_dead_threshold_set(unsigned int threshold) |
149 | { |
150 | if (threshold > O2HB_MIN_DEAD_THRESHOLD) { |
151 | spin_lock(lock: &o2hb_live_lock); |
152 | if (list_empty(head: &o2hb_all_regions)) |
153 | o2hb_dead_threshold = threshold; |
154 | spin_unlock(lock: &o2hb_live_lock); |
155 | } |
156 | } |
157 | |
158 | static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode) |
159 | { |
160 | int ret = -1; |
161 | |
162 | if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) { |
163 | spin_lock(lock: &o2hb_live_lock); |
164 | if (list_empty(head: &o2hb_all_regions)) { |
165 | o2hb_heartbeat_mode = hb_mode; |
166 | ret = 0; |
167 | } |
168 | spin_unlock(lock: &o2hb_live_lock); |
169 | } |
170 | |
171 | return ret; |
172 | } |
173 | |
174 | struct o2hb_node_event { |
175 | struct list_head hn_item; |
176 | enum o2hb_callback_type hn_event_type; |
177 | struct o2nm_node *hn_node; |
178 | int hn_node_num; |
179 | }; |
180 | |
181 | struct o2hb_disk_slot { |
182 | struct o2hb_disk_heartbeat_block *ds_raw_block; |
183 | u8 ds_node_num; |
184 | u64 ds_last_time; |
185 | u64 ds_last_generation; |
186 | u16 ds_equal_samples; |
187 | u16 ds_changed_samples; |
188 | struct list_head ds_live_item; |
189 | }; |
190 | |
191 | /* each thread owns a region.. when we're asked to tear down the region |
192 | * we ask the thread to stop, who cleans up the region */ |
193 | struct o2hb_region { |
194 | struct config_item hr_item; |
195 | |
196 | struct list_head hr_all_item; |
197 | unsigned hr_unclean_stop:1, |
198 | hr_aborted_start:1, |
199 | hr_item_pinned:1, |
200 | hr_item_dropped:1, |
201 | hr_node_deleted:1; |
202 | |
203 | /* protected by the hr_callback_sem */ |
204 | struct task_struct *hr_task; |
205 | |
206 | unsigned int hr_blocks; |
207 | unsigned long long hr_start_block; |
208 | |
209 | unsigned int hr_block_bits; |
210 | unsigned int hr_block_bytes; |
211 | |
212 | unsigned int hr_slots_per_page; |
213 | unsigned int hr_num_pages; |
214 | |
215 | struct page **hr_slot_data; |
216 | struct file *hr_bdev_file; |
217 | struct o2hb_disk_slot *hr_slots; |
218 | |
219 | /* live node map of this region */ |
220 | unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
221 | unsigned int hr_region_num; |
222 | |
223 | struct dentry *hr_debug_dir; |
224 | struct o2hb_debug_buf *hr_db_livenodes; |
225 | struct o2hb_debug_buf *hr_db_regnum; |
226 | struct o2hb_debug_buf *hr_db_elapsed_time; |
227 | struct o2hb_debug_buf *hr_db_pinned; |
228 | |
229 | /* let the person setting up hb wait for it to return until it |
230 | * has reached a 'steady' state. This will be fixed when we have |
231 | * a more complete api that doesn't lead to this sort of fragility. */ |
232 | atomic_t hr_steady_iterations; |
233 | |
234 | /* terminate o2hb thread if it does not reach steady state |
235 | * (hr_steady_iterations == 0) within hr_unsteady_iterations */ |
236 | atomic_t hr_unsteady_iterations; |
237 | |
238 | unsigned int hr_timeout_ms; |
239 | |
240 | /* randomized as the region goes up and down so that a node |
241 | * recognizes a node going up and down in one iteration */ |
242 | u64 hr_generation; |
243 | |
244 | struct delayed_work hr_write_timeout_work; |
245 | unsigned long hr_last_timeout_start; |
246 | |
247 | /* negotiate timer, used to negotiate extending hb timeout. */ |
248 | struct delayed_work hr_nego_timeout_work; |
249 | unsigned long hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
250 | |
251 | /* Used during o2hb_check_slot to hold a copy of the block |
252 | * being checked because we temporarily have to zero out the |
253 | * crc field. */ |
254 | struct o2hb_disk_heartbeat_block *hr_tmp_block; |
255 | |
256 | /* Message key for negotiate timeout message. */ |
257 | unsigned int hr_key; |
258 | struct list_head hr_handler_list; |
259 | |
260 | /* last hb status, 0 for success, other value for error. */ |
261 | int hr_last_hb_status; |
262 | }; |
263 | |
264 | static inline struct block_device *reg_bdev(struct o2hb_region *reg) |
265 | { |
266 | return reg->hr_bdev_file ? file_bdev(bdev_file: reg->hr_bdev_file) : NULL; |
267 | } |
268 | |
269 | struct o2hb_bio_wait_ctxt { |
270 | atomic_t wc_num_reqs; |
271 | struct completion wc_io_complete; |
272 | int wc_error; |
273 | }; |
274 | |
275 | #define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2) |
276 | |
277 | enum { |
278 | O2HB_NEGO_TIMEOUT_MSG = 1, |
279 | O2HB_NEGO_APPROVE_MSG = 2, |
280 | }; |
281 | |
282 | struct o2hb_nego_msg { |
283 | u8 node_num; |
284 | }; |
285 | |
286 | static void o2hb_write_timeout(struct work_struct *work) |
287 | { |
288 | int failed, quorum; |
289 | struct o2hb_region *reg = |
290 | container_of(work, struct o2hb_region, |
291 | hr_write_timeout_work.work); |
292 | |
293 | mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u " |
294 | "milliseconds\n" , reg_bdev(reg), |
295 | jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); |
296 | |
297 | if (o2hb_global_heartbeat_active()) { |
298 | spin_lock(lock: &o2hb_live_lock); |
299 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) |
300 | set_bit(nr: reg->hr_region_num, addr: o2hb_failed_region_bitmap); |
301 | failed = bitmap_weight(src: o2hb_failed_region_bitmap, |
302 | O2NM_MAX_REGIONS); |
303 | quorum = bitmap_weight(src: o2hb_quorum_region_bitmap, |
304 | O2NM_MAX_REGIONS); |
305 | spin_unlock(lock: &o2hb_live_lock); |
306 | |
307 | mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n" , |
308 | quorum, failed); |
309 | |
310 | /* |
311 | * Fence if the number of failed regions >= half the number |
312 | * of quorum regions |
313 | */ |
314 | if ((failed << 1) < quorum) |
315 | return; |
316 | } |
317 | |
318 | o2quo_disk_timeout(); |
319 | } |
320 | |
321 | static void o2hb_arm_timeout(struct o2hb_region *reg) |
322 | { |
323 | /* Arm writeout only after thread reaches steady state */ |
324 | if (atomic_read(v: ®->hr_steady_iterations) != 0) |
325 | return; |
326 | |
327 | mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n" , |
328 | O2HB_MAX_WRITE_TIMEOUT_MS); |
329 | |
330 | if (o2hb_global_heartbeat_active()) { |
331 | spin_lock(lock: &o2hb_live_lock); |
332 | clear_bit(nr: reg->hr_region_num, addr: o2hb_failed_region_bitmap); |
333 | spin_unlock(lock: &o2hb_live_lock); |
334 | } |
335 | cancel_delayed_work(dwork: ®->hr_write_timeout_work); |
336 | schedule_delayed_work(dwork: ®->hr_write_timeout_work, |
337 | delay: msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)); |
338 | |
339 | cancel_delayed_work(dwork: ®->hr_nego_timeout_work); |
340 | /* negotiate timeout must be less than write timeout. */ |
341 | schedule_delayed_work(dwork: ®->hr_nego_timeout_work, |
342 | delay: msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS)); |
343 | bitmap_zero(dst: reg->hr_nego_node_bitmap, O2NM_MAX_NODES); |
344 | } |
345 | |
346 | static void o2hb_disarm_timeout(struct o2hb_region *reg) |
347 | { |
348 | cancel_delayed_work_sync(dwork: ®->hr_write_timeout_work); |
349 | cancel_delayed_work_sync(dwork: ®->hr_nego_timeout_work); |
350 | } |
351 | |
352 | static int o2hb_send_nego_msg(int key, int type, u8 target) |
353 | { |
354 | struct o2hb_nego_msg msg; |
355 | int status, ret; |
356 | |
357 | msg.node_num = o2nm_this_node(); |
358 | again: |
359 | ret = o2net_send_message(msg_type: type, key, data: &msg, len: sizeof(msg), |
360 | target_node: target, status: &status); |
361 | |
362 | if (ret == -EAGAIN || ret == -ENOMEM) { |
363 | msleep(msecs: 100); |
364 | goto again; |
365 | } |
366 | |
367 | return ret; |
368 | } |
369 | |
370 | static void o2hb_nego_timeout(struct work_struct *work) |
371 | { |
372 | unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
373 | int master_node, i, ret; |
374 | struct o2hb_region *reg; |
375 | |
376 | reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work); |
377 | /* don't negotiate timeout if last hb failed since it is very |
378 | * possible io failed. Should let write timeout fence self. |
379 | */ |
380 | if (reg->hr_last_hb_status) |
381 | return; |
382 | |
383 | o2hb_fill_node_map(map: live_node_bitmap, O2NM_MAX_NODES); |
384 | /* lowest node as master node to make negotiate decision. */ |
385 | master_node = find_first_bit(addr: live_node_bitmap, O2NM_MAX_NODES); |
386 | |
387 | if (master_node == o2nm_this_node()) { |
388 | if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { |
389 | printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n" , |
390 | o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, |
391 | config_item_name(®->hr_item), reg_bdev(reg)); |
392 | set_bit(nr: master_node, addr: reg->hr_nego_node_bitmap); |
393 | } |
394 | if (!bitmap_equal(src1: reg->hr_nego_node_bitmap, src2: live_node_bitmap, |
395 | O2NM_MAX_NODES)) { |
396 | /* check negotiate bitmap every second to do timeout |
397 | * approve decision. |
398 | */ |
399 | schedule_delayed_work(dwork: ®->hr_nego_timeout_work, |
400 | delay: msecs_to_jiffies(m: 1000)); |
401 | |
402 | return; |
403 | } |
404 | |
405 | printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n" , |
406 | config_item_name(®->hr_item), |
407 | reg_bdev(reg)); |
408 | /* approve negotiate timeout request. */ |
409 | o2hb_arm_timeout(reg); |
410 | |
411 | i = -1; |
412 | while ((i = find_next_bit(addr: live_node_bitmap, |
413 | O2NM_MAX_NODES, offset: i + 1)) < O2NM_MAX_NODES) { |
414 | if (i == master_node) |
415 | continue; |
416 | |
417 | mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n" , i); |
418 | ret = o2hb_send_nego_msg(key: reg->hr_key, |
419 | type: O2HB_NEGO_APPROVE_MSG, target: i); |
420 | if (ret) |
421 | mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n" , |
422 | i, ret); |
423 | } |
424 | } else { |
425 | /* negotiate timeout with master node. */ |
426 | printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n" , |
427 | o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(®->hr_item), |
428 | reg_bdev(reg), master_node); |
429 | ret = o2hb_send_nego_msg(key: reg->hr_key, type: O2HB_NEGO_TIMEOUT_MSG, |
430 | target: master_node); |
431 | if (ret) |
432 | mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n" , |
433 | master_node, ret); |
434 | } |
435 | } |
436 | |
437 | static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data, |
438 | void **ret_data) |
439 | { |
440 | struct o2hb_region *reg = data; |
441 | struct o2hb_nego_msg *nego_msg; |
442 | |
443 | nego_msg = (struct o2hb_nego_msg *)msg->buf; |
444 | printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n" , |
445 | nego_msg->node_num, config_item_name(®->hr_item), |
446 | reg_bdev(reg)); |
447 | if (nego_msg->node_num < O2NM_MAX_NODES) |
448 | set_bit(nr: nego_msg->node_num, addr: reg->hr_nego_node_bitmap); |
449 | else |
450 | mlog(ML_ERROR, "got nego timeout message from bad node.\n" ); |
451 | |
452 | return 0; |
453 | } |
454 | |
455 | static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data, |
456 | void **ret_data) |
457 | { |
458 | struct o2hb_region *reg = data; |
459 | |
460 | printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n" , |
461 | config_item_name(®->hr_item), reg_bdev(reg)); |
462 | o2hb_arm_timeout(reg); |
463 | return 0; |
464 | } |
465 | |
466 | static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) |
467 | { |
468 | atomic_set(v: &wc->wc_num_reqs, i: 1); |
469 | init_completion(x: &wc->wc_io_complete); |
470 | wc->wc_error = 0; |
471 | } |
472 | |
473 | /* Used in error paths too */ |
474 | static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, |
475 | unsigned int num) |
476 | { |
477 | /* sadly atomic_sub_and_test() isn't available on all platforms. The |
478 | * good news is that the fast path only completes one at a time */ |
479 | while(num--) { |
480 | if (atomic_dec_and_test(v: &wc->wc_num_reqs)) { |
481 | BUG_ON(num > 0); |
482 | complete(&wc->wc_io_complete); |
483 | } |
484 | } |
485 | } |
486 | |
487 | static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc) |
488 | { |
489 | o2hb_bio_wait_dec(wc, num: 1); |
490 | wait_for_completion(&wc->wc_io_complete); |
491 | } |
492 | |
493 | static void o2hb_bio_end_io(struct bio *bio) |
494 | { |
495 | struct o2hb_bio_wait_ctxt *wc = bio->bi_private; |
496 | |
497 | if (bio->bi_status) { |
498 | mlog(ML_ERROR, "IO Error %d\n" , bio->bi_status); |
499 | wc->wc_error = blk_status_to_errno(status: bio->bi_status); |
500 | } |
501 | |
502 | o2hb_bio_wait_dec(wc, num: 1); |
503 | bio_put(bio); |
504 | } |
505 | |
506 | /* Setup a Bio to cover I/O against num_slots slots starting at |
507 | * start_slot. */ |
508 | static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, |
509 | struct o2hb_bio_wait_ctxt *wc, |
510 | unsigned int *current_slot, |
511 | unsigned int max_slots, blk_opf_t opf) |
512 | { |
513 | int len, current_page; |
514 | unsigned int vec_len, vec_start; |
515 | unsigned int bits = reg->hr_block_bits; |
516 | unsigned int spp = reg->hr_slots_per_page; |
517 | unsigned int cs = *current_slot; |
518 | struct bio *bio; |
519 | struct page *page; |
520 | |
521 | /* Testing has shown this allocation to take long enough under |
522 | * GFP_KERNEL that the local node can get fenced. It would be |
523 | * nicest if we could pre-allocate these bios and avoid this |
524 | * all together. */ |
525 | bio = bio_alloc(bdev: reg_bdev(reg), nr_vecs: 16, opf, GFP_ATOMIC); |
526 | if (!bio) { |
527 | mlog(ML_ERROR, "Could not alloc slots BIO!\n" ); |
528 | bio = ERR_PTR(error: -ENOMEM); |
529 | goto bail; |
530 | } |
531 | |
532 | /* Must put everything in 512 byte sectors for the bio... */ |
533 | bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); |
534 | bio->bi_private = wc; |
535 | bio->bi_end_io = o2hb_bio_end_io; |
536 | |
537 | vec_start = (cs << bits) % PAGE_SIZE; |
538 | while(cs < max_slots) { |
539 | current_page = cs / spp; |
540 | page = reg->hr_slot_data[current_page]; |
541 | |
542 | vec_len = min(PAGE_SIZE - vec_start, |
543 | (max_slots-cs) * (PAGE_SIZE/spp) ); |
544 | |
545 | mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n" , |
546 | current_page, vec_len, vec_start); |
547 | |
548 | len = bio_add_page(bio, page, len: vec_len, off: vec_start); |
549 | if (len != vec_len) break; |
550 | |
551 | cs += vec_len / (PAGE_SIZE/spp); |
552 | vec_start = 0; |
553 | } |
554 | |
555 | bail: |
556 | *current_slot = cs; |
557 | return bio; |
558 | } |
559 | |
560 | static int o2hb_read_slots(struct o2hb_region *reg, |
561 | unsigned int begin_slot, |
562 | unsigned int max_slots) |
563 | { |
564 | unsigned int current_slot = begin_slot; |
565 | int status; |
566 | struct o2hb_bio_wait_ctxt wc; |
567 | struct bio *bio; |
568 | |
569 | o2hb_bio_wait_init(wc: &wc); |
570 | |
571 | while(current_slot < max_slots) { |
572 | bio = o2hb_setup_one_bio(reg, wc: &wc, current_slot: ¤t_slot, max_slots, |
573 | opf: REQ_OP_READ); |
574 | if (IS_ERR(ptr: bio)) { |
575 | status = PTR_ERR(ptr: bio); |
576 | mlog_errno(status); |
577 | goto bail_and_wait; |
578 | } |
579 | |
580 | atomic_inc(v: &wc.wc_num_reqs); |
581 | submit_bio(bio); |
582 | } |
583 | |
584 | status = 0; |
585 | |
586 | bail_and_wait: |
587 | o2hb_wait_on_io(wc: &wc); |
588 | if (wc.wc_error && !status) |
589 | status = wc.wc_error; |
590 | |
591 | return status; |
592 | } |
593 | |
594 | static int o2hb_issue_node_write(struct o2hb_region *reg, |
595 | struct o2hb_bio_wait_ctxt *write_wc) |
596 | { |
597 | int status; |
598 | unsigned int slot; |
599 | struct bio *bio; |
600 | |
601 | o2hb_bio_wait_init(wc: write_wc); |
602 | |
603 | slot = o2nm_this_node(); |
604 | |
605 | bio = o2hb_setup_one_bio(reg, wc: write_wc, current_slot: &slot, max_slots: slot+1, |
606 | opf: REQ_OP_WRITE | REQ_SYNC); |
607 | if (IS_ERR(ptr: bio)) { |
608 | status = PTR_ERR(ptr: bio); |
609 | mlog_errno(status); |
610 | goto bail; |
611 | } |
612 | |
613 | atomic_inc(v: &write_wc->wc_num_reqs); |
614 | submit_bio(bio); |
615 | |
616 | status = 0; |
617 | bail: |
618 | return status; |
619 | } |
620 | |
621 | static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg, |
622 | struct o2hb_disk_heartbeat_block *hb_block) |
623 | { |
624 | __le32 old_cksum; |
625 | u32 ret; |
626 | |
627 | /* We want to compute the block crc with a 0 value in the |
628 | * hb_cksum field. Save it off here and replace after the |
629 | * crc. */ |
630 | old_cksum = hb_block->hb_cksum; |
631 | hb_block->hb_cksum = 0; |
632 | |
633 | ret = crc32_le(crc: 0, p: (unsigned char *) hb_block, len: reg->hr_block_bytes); |
634 | |
635 | hb_block->hb_cksum = old_cksum; |
636 | |
637 | return ret; |
638 | } |
639 | |
640 | static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block) |
641 | { |
642 | mlog(ML_ERROR, "Dump slot information: seq = 0x%llx, node = %u, " |
643 | "cksum = 0x%x, generation 0x%llx\n" , |
644 | (long long)le64_to_cpu(hb_block->hb_seq), |
645 | hb_block->hb_node, le32_to_cpu(hb_block->hb_cksum), |
646 | (long long)le64_to_cpu(hb_block->hb_generation)); |
647 | } |
648 | |
649 | static int o2hb_verify_crc(struct o2hb_region *reg, |
650 | struct o2hb_disk_heartbeat_block *hb_block) |
651 | { |
652 | u32 read, computed; |
653 | |
654 | read = le32_to_cpu(hb_block->hb_cksum); |
655 | computed = o2hb_compute_block_crc_le(reg, hb_block); |
656 | |
657 | return read == computed; |
658 | } |
659 | |
660 | /* |
661 | * Compare the slot data with what we wrote in the last iteration. |
662 | * If the match fails, print an appropriate error message. This is to |
663 | * detect errors like... another node hearting on the same slot, |
664 | * flaky device that is losing writes, etc. |
665 | * Returns 1 if check succeeds, 0 otherwise. |
666 | */ |
667 | static int o2hb_check_own_slot(struct o2hb_region *reg) |
668 | { |
669 | struct o2hb_disk_slot *slot; |
670 | struct o2hb_disk_heartbeat_block *hb_block; |
671 | char *errstr; |
672 | |
673 | slot = ®->hr_slots[o2nm_this_node()]; |
674 | /* Don't check on our 1st timestamp */ |
675 | if (!slot->ds_last_time) |
676 | return 0; |
677 | |
678 | hb_block = slot->ds_raw_block; |
679 | if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && |
680 | le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && |
681 | hb_block->hb_node == slot->ds_node_num) |
682 | return 1; |
683 | |
684 | #define ERRSTR1 "Another node is heartbeating on device" |
685 | #define ERRSTR2 "Heartbeat generation mismatch on device" |
686 | #define ERRSTR3 "Heartbeat sequence mismatch on device" |
687 | |
688 | if (hb_block->hb_node != slot->ds_node_num) |
689 | errstr = ERRSTR1; |
690 | else if (le64_to_cpu(hb_block->hb_generation) != |
691 | slot->ds_last_generation) |
692 | errstr = ERRSTR2; |
693 | else |
694 | errstr = ERRSTR3; |
695 | |
696 | mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), " |
697 | "ondisk(%u:0x%llx, 0x%llx)\n" , errstr, reg_bdev(reg), |
698 | slot->ds_node_num, (unsigned long long)slot->ds_last_generation, |
699 | (unsigned long long)slot->ds_last_time, hb_block->hb_node, |
700 | (unsigned long long)le64_to_cpu(hb_block->hb_generation), |
701 | (unsigned long long)le64_to_cpu(hb_block->hb_seq)); |
702 | |
703 | return 0; |
704 | } |
705 | |
706 | static inline void o2hb_prepare_block(struct o2hb_region *reg, |
707 | u64 generation) |
708 | { |
709 | int node_num; |
710 | u64 cputime; |
711 | struct o2hb_disk_slot *slot; |
712 | struct o2hb_disk_heartbeat_block *hb_block; |
713 | |
714 | node_num = o2nm_this_node(); |
715 | slot = ®->hr_slots[node_num]; |
716 | |
717 | hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; |
718 | memset(hb_block, 0, reg->hr_block_bytes); |
719 | /* TODO: time stuff */ |
720 | cputime = ktime_get_real_seconds(); |
721 | if (!cputime) |
722 | cputime = 1; |
723 | |
724 | hb_block->hb_seq = cpu_to_le64(cputime); |
725 | hb_block->hb_node = node_num; |
726 | hb_block->hb_generation = cpu_to_le64(generation); |
727 | hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS); |
728 | |
729 | /* This step must always happen last! */ |
730 | hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, |
731 | hb_block)); |
732 | |
733 | mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n" , |
734 | (long long)generation, |
735 | le32_to_cpu(hb_block->hb_cksum)); |
736 | } |
737 | |
738 | static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, |
739 | struct o2nm_node *node, |
740 | int idx) |
741 | { |
742 | struct o2hb_callback_func *f; |
743 | |
744 | list_for_each_entry(f, &hbcall->list, hc_item) { |
745 | mlog(ML_HEARTBEAT, "calling funcs %p\n" , f); |
746 | (f->hc_func)(node, idx, f->hc_data); |
747 | } |
748 | } |
749 | |
750 | /* Will run the list in order until we process the passed event */ |
751 | static void o2hb_run_event_list(struct o2hb_node_event *queued_event) |
752 | { |
753 | struct o2hb_callback *hbcall; |
754 | struct o2hb_node_event *event; |
755 | |
756 | /* Holding callback sem assures we don't alter the callback |
757 | * lists when doing this, and serializes ourselves with other |
758 | * processes wanting callbacks. */ |
759 | down_write(sem: &o2hb_callback_sem); |
760 | |
761 | spin_lock(lock: &o2hb_live_lock); |
762 | while (!list_empty(head: &o2hb_node_events) |
763 | && !list_empty(head: &queued_event->hn_item)) { |
764 | event = list_entry(o2hb_node_events.next, |
765 | struct o2hb_node_event, |
766 | hn_item); |
767 | list_del_init(entry: &event->hn_item); |
768 | spin_unlock(lock: &o2hb_live_lock); |
769 | |
770 | mlog(ML_HEARTBEAT, "Node %s event for %d\n" , |
771 | event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN" , |
772 | event->hn_node_num); |
773 | |
774 | hbcall = hbcall_from_type(type: event->hn_event_type); |
775 | |
776 | /* We should *never* have gotten on to the list with a |
777 | * bad type... This isn't something that we should try |
778 | * to recover from. */ |
779 | BUG_ON(IS_ERR(hbcall)); |
780 | |
781 | o2hb_fire_callbacks(hbcall, node: event->hn_node, idx: event->hn_node_num); |
782 | |
783 | spin_lock(lock: &o2hb_live_lock); |
784 | } |
785 | spin_unlock(lock: &o2hb_live_lock); |
786 | |
787 | up_write(sem: &o2hb_callback_sem); |
788 | } |
789 | |
790 | static void o2hb_queue_node_event(struct o2hb_node_event *event, |
791 | enum o2hb_callback_type type, |
792 | struct o2nm_node *node, |
793 | int node_num) |
794 | { |
795 | assert_spin_locked(&o2hb_live_lock); |
796 | |
797 | BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB)); |
798 | |
799 | event->hn_event_type = type; |
800 | event->hn_node = node; |
801 | event->hn_node_num = node_num; |
802 | |
803 | mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n" , |
804 | type == O2HB_NODE_UP_CB ? "UP" : "DOWN" , node_num); |
805 | |
806 | list_add_tail(new: &event->hn_item, head: &o2hb_node_events); |
807 | } |
808 | |
809 | static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) |
810 | { |
811 | struct o2hb_node_event event = |
812 | { .hn_item = LIST_HEAD_INIT(event.hn_item), }; |
813 | struct o2nm_node *node; |
814 | int queued = 0; |
815 | |
816 | node = o2nm_get_node_by_num(node_num: slot->ds_node_num); |
817 | if (!node) |
818 | return; |
819 | |
820 | spin_lock(lock: &o2hb_live_lock); |
821 | if (!list_empty(head: &slot->ds_live_item)) { |
822 | mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n" , |
823 | slot->ds_node_num); |
824 | |
825 | list_del_init(entry: &slot->ds_live_item); |
826 | |
827 | if (list_empty(head: &o2hb_live_slots[slot->ds_node_num])) { |
828 | clear_bit(nr: slot->ds_node_num, addr: o2hb_live_node_bitmap); |
829 | |
830 | o2hb_queue_node_event(event: &event, type: O2HB_NODE_DOWN_CB, node, |
831 | node_num: slot->ds_node_num); |
832 | queued = 1; |
833 | } |
834 | } |
835 | spin_unlock(lock: &o2hb_live_lock); |
836 | |
837 | if (queued) |
838 | o2hb_run_event_list(queued_event: &event); |
839 | |
840 | o2nm_node_put(node); |
841 | } |
842 | |
843 | static void o2hb_set_quorum_device(struct o2hb_region *reg) |
844 | { |
845 | if (!o2hb_global_heartbeat_active()) |
846 | return; |
847 | |
848 | /* Prevent race with o2hb_heartbeat_group_drop_item() */ |
849 | if (kthread_should_stop()) |
850 | return; |
851 | |
852 | /* Tag region as quorum only after thread reaches steady state */ |
853 | if (atomic_read(v: ®->hr_steady_iterations) != 0) |
854 | return; |
855 | |
856 | spin_lock(lock: &o2hb_live_lock); |
857 | |
858 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) |
859 | goto unlock; |
860 | |
861 | /* |
862 | * A region can be added to the quorum only when it sees all |
863 | * live nodes heartbeat on it. In other words, the region has been |
864 | * added to all nodes. |
865 | */ |
866 | if (!bitmap_equal(src1: reg->hr_live_node_bitmap, src2: o2hb_live_node_bitmap, |
867 | O2NM_MAX_NODES)) |
868 | goto unlock; |
869 | |
870 | printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n" , |
871 | config_item_name(®->hr_item), reg_bdev(reg)); |
872 | |
873 | set_bit(nr: reg->hr_region_num, addr: o2hb_quorum_region_bitmap); |
874 | |
875 | /* |
876 | * If global heartbeat active, unpin all regions if the |
877 | * region count > CUT_OFF |
878 | */ |
879 | if (bitmap_weight(src: o2hb_quorum_region_bitmap, |
880 | O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) |
881 | o2hb_region_unpin(NULL); |
882 | unlock: |
883 | spin_unlock(lock: &o2hb_live_lock); |
884 | } |
885 | |
886 | static int o2hb_check_slot(struct o2hb_region *reg, |
887 | struct o2hb_disk_slot *slot) |
888 | { |
889 | int changed = 0, gen_changed = 0; |
890 | struct o2hb_node_event event = |
891 | { .hn_item = LIST_HEAD_INIT(event.hn_item), }; |
892 | struct o2nm_node *node; |
893 | struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; |
894 | u64 cputime; |
895 | unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; |
896 | unsigned int slot_dead_ms; |
897 | int tmp; |
898 | int queued = 0; |
899 | |
900 | memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); |
901 | |
902 | /* |
903 | * If a node is no longer configured but is still in the livemap, we |
904 | * may need to clear that bit from the livemap. |
905 | */ |
906 | node = o2nm_get_node_by_num(node_num: slot->ds_node_num); |
907 | if (!node) { |
908 | spin_lock(lock: &o2hb_live_lock); |
909 | tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap); |
910 | spin_unlock(lock: &o2hb_live_lock); |
911 | if (!tmp) |
912 | return 0; |
913 | } |
914 | |
915 | if (!o2hb_verify_crc(reg, hb_block)) { |
916 | /* all paths from here will drop o2hb_live_lock for |
917 | * us. */ |
918 | spin_lock(lock: &o2hb_live_lock); |
919 | |
920 | /* Don't print an error on the console in this case - |
921 | * a freshly formatted heartbeat area will not have a |
922 | * crc set on it. */ |
923 | if (list_empty(head: &slot->ds_live_item)) |
924 | goto out; |
925 | |
926 | /* The node is live but pushed out a bad crc. We |
927 | * consider it a transient miss but don't populate any |
928 | * other values as they may be junk. */ |
929 | mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n" , |
930 | slot->ds_node_num, reg_bdev(reg)); |
931 | o2hb_dump_slot(hb_block); |
932 | |
933 | slot->ds_equal_samples++; |
934 | goto fire_callbacks; |
935 | } |
936 | |
937 | /* we don't care if these wrap.. the state transitions below |
938 | * clear at the right places */ |
939 | cputime = le64_to_cpu(hb_block->hb_seq); |
940 | if (slot->ds_last_time != cputime) |
941 | slot->ds_changed_samples++; |
942 | else |
943 | slot->ds_equal_samples++; |
944 | slot->ds_last_time = cputime; |
945 | |
946 | /* The node changed heartbeat generations. We assume this to |
947 | * mean it dropped off but came back before we timed out. We |
948 | * want to consider it down for the time being but don't want |
949 | * to lose any changed_samples state we might build up to |
950 | * considering it live again. */ |
951 | if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) { |
952 | gen_changed = 1; |
953 | slot->ds_equal_samples = 0; |
954 | mlog(ML_HEARTBEAT, "Node %d changed generation (0x%llx " |
955 | "to 0x%llx)\n" , slot->ds_node_num, |
956 | (long long)slot->ds_last_generation, |
957 | (long long)le64_to_cpu(hb_block->hb_generation)); |
958 | } |
959 | |
960 | slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); |
961 | |
962 | mlog(ML_HEARTBEAT, "Slot %d gen 0x%llx cksum 0x%x " |
963 | "seq %llu last %llu changed %u equal %u\n" , |
964 | slot->ds_node_num, (long long)slot->ds_last_generation, |
965 | le32_to_cpu(hb_block->hb_cksum), |
966 | (unsigned long long)le64_to_cpu(hb_block->hb_seq), |
967 | (unsigned long long)slot->ds_last_time, slot->ds_changed_samples, |
968 | slot->ds_equal_samples); |
969 | |
970 | spin_lock(lock: &o2hb_live_lock); |
971 | |
972 | fire_callbacks: |
973 | /* dead nodes only come to life after some number of |
974 | * changes at any time during their dead time */ |
975 | if (list_empty(head: &slot->ds_live_item) && |
976 | slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) { |
977 | mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n" , |
978 | slot->ds_node_num, (long long)slot->ds_last_generation); |
979 | |
980 | set_bit(nr: slot->ds_node_num, addr: reg->hr_live_node_bitmap); |
981 | |
982 | /* first on the list generates a callback */ |
983 | if (list_empty(head: &o2hb_live_slots[slot->ds_node_num])) { |
984 | mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes " |
985 | "bitmap\n" , slot->ds_node_num); |
986 | set_bit(nr: slot->ds_node_num, addr: o2hb_live_node_bitmap); |
987 | |
988 | o2hb_queue_node_event(event: &event, type: O2HB_NODE_UP_CB, node, |
989 | node_num: slot->ds_node_num); |
990 | |
991 | changed = 1; |
992 | queued = 1; |
993 | } |
994 | |
995 | list_add_tail(new: &slot->ds_live_item, |
996 | head: &o2hb_live_slots[slot->ds_node_num]); |
997 | |
998 | slot->ds_equal_samples = 0; |
999 | |
1000 | /* We want to be sure that all nodes agree on the |
1001 | * number of milliseconds before a node will be |
1002 | * considered dead. The self-fencing timeout is |
1003 | * computed from this value, and a discrepancy might |
1004 | * result in heartbeat calling a node dead when it |
1005 | * hasn't self-fenced yet. */ |
1006 | slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); |
1007 | if (slot_dead_ms && slot_dead_ms != dead_ms) { |
1008 | /* TODO: Perhaps we can fail the region here. */ |
1009 | mlog(ML_ERROR, "Node %d on device %pg has a dead count " |
1010 | "of %u ms, but our count is %u ms.\n" |
1011 | "Please double check your configuration values " |
1012 | "for 'O2CB_HEARTBEAT_THRESHOLD'\n" , |
1013 | slot->ds_node_num, reg_bdev(reg), |
1014 | slot_dead_ms, dead_ms); |
1015 | } |
1016 | goto out; |
1017 | } |
1018 | |
1019 | /* if the list is dead, we're done.. */ |
1020 | if (list_empty(head: &slot->ds_live_item)) |
1021 | goto out; |
1022 | |
1023 | /* live nodes only go dead after enough consequtive missed |
1024 | * samples.. reset the missed counter whenever we see |
1025 | * activity */ |
1026 | if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) { |
1027 | mlog(ML_HEARTBEAT, "Node %d left my region\n" , |
1028 | slot->ds_node_num); |
1029 | |
1030 | clear_bit(nr: slot->ds_node_num, addr: reg->hr_live_node_bitmap); |
1031 | |
1032 | /* last off the live_slot generates a callback */ |
1033 | list_del_init(entry: &slot->ds_live_item); |
1034 | if (list_empty(head: &o2hb_live_slots[slot->ds_node_num])) { |
1035 | mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live " |
1036 | "nodes bitmap\n" , slot->ds_node_num); |
1037 | clear_bit(nr: slot->ds_node_num, addr: o2hb_live_node_bitmap); |
1038 | |
1039 | /* node can be null */ |
1040 | o2hb_queue_node_event(event: &event, type: O2HB_NODE_DOWN_CB, |
1041 | node, node_num: slot->ds_node_num); |
1042 | |
1043 | changed = 1; |
1044 | queued = 1; |
1045 | } |
1046 | |
1047 | /* We don't clear this because the node is still |
1048 | * actually writing new blocks. */ |
1049 | if (!gen_changed) |
1050 | slot->ds_changed_samples = 0; |
1051 | goto out; |
1052 | } |
1053 | if (slot->ds_changed_samples) { |
1054 | slot->ds_changed_samples = 0; |
1055 | slot->ds_equal_samples = 0; |
1056 | } |
1057 | out: |
1058 | spin_unlock(lock: &o2hb_live_lock); |
1059 | |
1060 | if (queued) |
1061 | o2hb_run_event_list(queued_event: &event); |
1062 | |
1063 | if (node) |
1064 | o2nm_node_put(node); |
1065 | return changed; |
1066 | } |
1067 | |
/*
 * Position of the highest set bit in @nodes, searching @numbits bits.
 * o2hb_do_disk_heartbeat() treats a result >= the bit count as "no bit set".
 */
static int o2hb_highest_node(unsigned long *nodes, int numbits)
{
	int highest = find_last_bit(nodes, numbits);

	return highest;
}
1072 | |
/*
 * Position of the lowest set bit in @nodes, searching @numbits bits.
 * o2hb_do_disk_heartbeat() treats a result >= the bit count as "no bit set".
 */
static int o2hb_lowest_node(unsigned long *nodes, int numbits)
{
	int lowest = find_first_bit(nodes, numbits);

	return lowest;
}
1077 | |
1078 | static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) |
1079 | { |
1080 | int i, ret, highest_node, lowest_node; |
1081 | int membership_change = 0, own_slot_ok = 0; |
1082 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
1083 | unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
1084 | struct o2hb_bio_wait_ctxt write_wc; |
1085 | |
1086 | ret = o2nm_configured_node_map(map: configured_nodes, |
1087 | bytes: sizeof(configured_nodes)); |
1088 | if (ret) { |
1089 | mlog_errno(ret); |
1090 | goto bail; |
1091 | } |
1092 | |
1093 | /* |
1094 | * If a node is not configured but is in the livemap, we still need |
1095 | * to read the slot so as to be able to remove it from the livemap. |
1096 | */ |
1097 | o2hb_fill_node_map(map: live_node_bitmap, O2NM_MAX_NODES); |
1098 | i = -1; |
1099 | while ((i = find_next_bit(addr: live_node_bitmap, |
1100 | O2NM_MAX_NODES, offset: i + 1)) < O2NM_MAX_NODES) { |
1101 | set_bit(nr: i, addr: configured_nodes); |
1102 | } |
1103 | |
1104 | highest_node = o2hb_highest_node(nodes: configured_nodes, O2NM_MAX_NODES); |
1105 | lowest_node = o2hb_lowest_node(nodes: configured_nodes, O2NM_MAX_NODES); |
1106 | if (highest_node >= O2NM_MAX_NODES || lowest_node >= O2NM_MAX_NODES) { |
1107 | mlog(ML_NOTICE, "o2hb: No configured nodes found!\n" ); |
1108 | ret = -EINVAL; |
1109 | goto bail; |
1110 | } |
1111 | |
1112 | /* No sense in reading the slots of nodes that don't exist |
1113 | * yet. Of course, if the node definitions have holes in them |
1114 | * then we're reading an empty slot anyway... Consider this |
1115 | * best-effort. */ |
1116 | ret = o2hb_read_slots(reg, begin_slot: lowest_node, max_slots: highest_node + 1); |
1117 | if (ret < 0) { |
1118 | mlog_errno(ret); |
1119 | goto bail; |
1120 | } |
1121 | |
1122 | /* With an up to date view of the slots, we can check that no |
1123 | * other node has been improperly configured to heartbeat in |
1124 | * our slot. */ |
1125 | own_slot_ok = o2hb_check_own_slot(reg); |
1126 | |
1127 | /* fill in the proper info for our next heartbeat */ |
1128 | o2hb_prepare_block(reg, generation: reg->hr_generation); |
1129 | |
1130 | ret = o2hb_issue_node_write(reg, write_wc: &write_wc); |
1131 | if (ret < 0) { |
1132 | mlog_errno(ret); |
1133 | goto bail; |
1134 | } |
1135 | |
1136 | i = -1; |
1137 | while((i = find_next_bit(addr: configured_nodes, |
1138 | O2NM_MAX_NODES, offset: i + 1)) < O2NM_MAX_NODES) { |
1139 | membership_change |= o2hb_check_slot(reg, slot: ®->hr_slots[i]); |
1140 | } |
1141 | |
1142 | /* |
1143 | * We have to be sure we've advertised ourselves on disk |
1144 | * before we can go to steady state. This ensures that |
1145 | * people we find in our steady state have seen us. |
1146 | */ |
1147 | o2hb_wait_on_io(wc: &write_wc); |
1148 | if (write_wc.wc_error) { |
1149 | /* Do not re-arm the write timeout on I/O error - we |
1150 | * can't be sure that the new block ever made it to |
1151 | * disk */ |
1152 | mlog(ML_ERROR, "Write error %d on device \"%pg\"\n" , |
1153 | write_wc.wc_error, reg_bdev(reg)); |
1154 | ret = write_wc.wc_error; |
1155 | goto bail; |
1156 | } |
1157 | |
1158 | /* Skip disarming the timeout if own slot has stale/bad data */ |
1159 | if (own_slot_ok) { |
1160 | o2hb_set_quorum_device(reg); |
1161 | o2hb_arm_timeout(reg); |
1162 | reg->hr_last_timeout_start = jiffies; |
1163 | } |
1164 | |
1165 | bail: |
1166 | /* let the person who launched us know when things are steady */ |
1167 | if (atomic_read(v: ®->hr_steady_iterations) != 0) { |
1168 | if (!ret && own_slot_ok && !membership_change) { |
1169 | if (atomic_dec_and_test(v: ®->hr_steady_iterations)) |
1170 | wake_up(&o2hb_steady_queue); |
1171 | } |
1172 | } |
1173 | |
1174 | if (atomic_read(v: ®->hr_steady_iterations) != 0) { |
1175 | if (atomic_dec_and_test(v: ®->hr_unsteady_iterations)) { |
1176 | printk(KERN_NOTICE "o2hb: Unable to stabilize " |
1177 | "heartbeat on region %s (%pg)\n" , |
1178 | config_item_name(®->hr_item), |
1179 | reg_bdev(reg)); |
1180 | atomic_set(v: ®->hr_steady_iterations, i: 0); |
1181 | reg->hr_aborted_start = 1; |
1182 | wake_up(&o2hb_steady_queue); |
1183 | ret = -EIO; |
1184 | } |
1185 | } |
1186 | |
1187 | return ret; |
1188 | } |
1189 | |
1190 | /* |
1191 | * we ride the region ref that the region dir holds. before the region |
1192 | * dir is removed and drops it ref it will wait to tear down this |
1193 | * thread. |
1194 | */ |
1195 | static int o2hb_thread(void *data) |
1196 | { |
1197 | int i, ret; |
1198 | struct o2hb_region *reg = data; |
1199 | struct o2hb_bio_wait_ctxt write_wc; |
1200 | ktime_t before_hb, after_hb; |
1201 | unsigned int elapsed_msec; |
1202 | |
1203 | mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n" ); |
1204 | |
1205 | set_user_nice(current, MIN_NICE); |
1206 | |
1207 | /* Pin node */ |
1208 | ret = o2nm_depend_this_node(); |
1209 | if (ret) { |
1210 | mlog(ML_ERROR, "Node has been deleted, ret = %d\n" , ret); |
1211 | reg->hr_node_deleted = 1; |
1212 | wake_up(&o2hb_steady_queue); |
1213 | return 0; |
1214 | } |
1215 | |
1216 | while (!kthread_should_stop() && |
1217 | !reg->hr_unclean_stop && !reg->hr_aborted_start) { |
1218 | /* We track the time spent inside |
1219 | * o2hb_do_disk_heartbeat so that we avoid more than |
1220 | * hr_timeout_ms between disk writes. On busy systems |
1221 | * this should result in a heartbeat which is less |
1222 | * likely to time itself out. */ |
1223 | before_hb = ktime_get_real(); |
1224 | |
1225 | ret = o2hb_do_disk_heartbeat(reg); |
1226 | reg->hr_last_hb_status = ret; |
1227 | |
1228 | after_hb = ktime_get_real(); |
1229 | |
1230 | elapsed_msec = (unsigned int) |
1231 | ktime_ms_delta(later: after_hb, earlier: before_hb); |
1232 | |
1233 | mlog(ML_HEARTBEAT, |
1234 | "start = %lld, end = %lld, msec = %u, ret = %d\n" , |
1235 | before_hb, after_hb, elapsed_msec, ret); |
1236 | |
1237 | if (!kthread_should_stop() && |
1238 | elapsed_msec < reg->hr_timeout_ms) { |
1239 | /* the kthread api has blocked signals for us so no |
1240 | * need to record the return value. */ |
1241 | msleep_interruptible(msecs: reg->hr_timeout_ms - elapsed_msec); |
1242 | } |
1243 | } |
1244 | |
1245 | o2hb_disarm_timeout(reg); |
1246 | |
1247 | /* unclean stop is only used in very bad situation */ |
1248 | for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++) |
1249 | o2hb_shutdown_slot(slot: ®->hr_slots[i]); |
1250 | |
1251 | /* Explicit down notification - avoid forcing the other nodes |
1252 | * to timeout on this region when we could just as easily |
1253 | * write a clear generation - thus indicating to them that |
1254 | * this node has left this region. |
1255 | */ |
1256 | if (!reg->hr_unclean_stop && !reg->hr_aborted_start) { |
1257 | o2hb_prepare_block(reg, generation: 0); |
1258 | ret = o2hb_issue_node_write(reg, write_wc: &write_wc); |
1259 | if (ret == 0) |
1260 | o2hb_wait_on_io(wc: &write_wc); |
1261 | else |
1262 | mlog_errno(ret); |
1263 | } |
1264 | |
1265 | /* Unpin node */ |
1266 | o2nm_undepend_this_node(); |
1267 | |
1268 | mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n" ); |
1269 | |
1270 | return 0; |
1271 | } |
1272 | |
1273 | #ifdef CONFIG_DEBUG_FS |
1274 | static int o2hb_debug_open(struct inode *inode, struct file *file) |
1275 | { |
1276 | struct o2hb_debug_buf *db = inode->i_private; |
1277 | struct o2hb_region *reg; |
1278 | unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
1279 | unsigned long lts; |
1280 | char *buf = NULL; |
1281 | int i = -1; |
1282 | int out = 0; |
1283 | |
1284 | /* max_nodes should be the largest bitmap we pass here */ |
1285 | BUG_ON(sizeof(map) < db->db_size); |
1286 | |
1287 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); |
1288 | if (!buf) |
1289 | goto bail; |
1290 | |
1291 | switch (db->db_type) { |
1292 | case O2HB_DB_TYPE_LIVENODES: |
1293 | case O2HB_DB_TYPE_LIVEREGIONS: |
1294 | case O2HB_DB_TYPE_QUORUMREGIONS: |
1295 | case O2HB_DB_TYPE_FAILEDREGIONS: |
1296 | spin_lock(lock: &o2hb_live_lock); |
1297 | memcpy(map, db->db_data, db->db_size); |
1298 | spin_unlock(lock: &o2hb_live_lock); |
1299 | break; |
1300 | |
1301 | case O2HB_DB_TYPE_REGION_LIVENODES: |
1302 | spin_lock(lock: &o2hb_live_lock); |
1303 | reg = (struct o2hb_region *)db->db_data; |
1304 | memcpy(map, reg->hr_live_node_bitmap, db->db_size); |
1305 | spin_unlock(lock: &o2hb_live_lock); |
1306 | break; |
1307 | |
1308 | case O2HB_DB_TYPE_REGION_NUMBER: |
1309 | reg = (struct o2hb_region *)db->db_data; |
1310 | out += scnprintf(buf: buf + out, PAGE_SIZE - out, fmt: "%d\n" , |
1311 | reg->hr_region_num); |
1312 | goto done; |
1313 | |
1314 | case O2HB_DB_TYPE_REGION_ELAPSED_TIME: |
1315 | reg = (struct o2hb_region *)db->db_data; |
1316 | lts = reg->hr_last_timeout_start; |
1317 | /* If 0, it has never been set before */ |
1318 | if (lts) |
1319 | lts = jiffies_to_msecs(j: jiffies - lts); |
1320 | out += scnprintf(buf: buf + out, PAGE_SIZE - out, fmt: "%lu\n" , lts); |
1321 | goto done; |
1322 | |
1323 | case O2HB_DB_TYPE_REGION_PINNED: |
1324 | reg = (struct o2hb_region *)db->db_data; |
1325 | out += scnprintf(buf: buf + out, PAGE_SIZE - out, fmt: "%u\n" , |
1326 | !!reg->hr_item_pinned); |
1327 | goto done; |
1328 | |
1329 | default: |
1330 | goto done; |
1331 | } |
1332 | |
1333 | while ((i = find_next_bit(addr: map, size: db->db_len, offset: i + 1)) < db->db_len) |
1334 | out += scnprintf(buf: buf + out, PAGE_SIZE - out, fmt: "%d " , i); |
1335 | out += scnprintf(buf: buf + out, PAGE_SIZE - out, fmt: "\n" ); |
1336 | |
1337 | done: |
1338 | i_size_write(inode, i_size: out); |
1339 | |
1340 | file->private_data = buf; |
1341 | |
1342 | return 0; |
1343 | bail: |
1344 | return -ENOMEM; |
1345 | } |
1346 | |
1347 | static int o2hb_debug_release(struct inode *inode, struct file *file) |
1348 | { |
1349 | kfree(objp: file->private_data); |
1350 | return 0; |
1351 | } |
1352 | |
1353 | static ssize_t o2hb_debug_read(struct file *file, char __user *buf, |
1354 | size_t nbytes, loff_t *ppos) |
1355 | { |
1356 | return simple_read_from_buffer(to: buf, count: nbytes, ppos, from: file->private_data, |
1357 | available: i_size_read(inode: file->f_mapping->host)); |
1358 | } |
1359 | #else |
/* Stub for builds without CONFIG_DEBUG_FS: nothing to render, report success. */
static int o2hb_debug_open(struct inode *inode, struct file *file)
{
	return 0;
}
/* Stub for builds without CONFIG_DEBUG_FS: nothing was allocated at open. */
static int o2hb_debug_release(struct inode *inode, struct file *file)
{
	return 0;
}
/* Stub for builds without CONFIG_DEBUG_FS: always reports zero bytes read. */
static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
			       size_t nbytes, loff_t *ppos)
{
	return 0;
}
1373 | #endif /* CONFIG_DEBUG_FS */ |
1374 | |
1375 | static const struct file_operations o2hb_debug_fops = { |
1376 | .open = o2hb_debug_open, |
1377 | .release = o2hb_debug_release, |
1378 | .read = o2hb_debug_read, |
1379 | .llseek = generic_file_llseek, |
1380 | }; |
1381 | |
1382 | void o2hb_exit(void) |
1383 | { |
1384 | debugfs_remove_recursive(dentry: o2hb_debug_dir); |
1385 | kfree(objp: o2hb_db_livenodes); |
1386 | kfree(objp: o2hb_db_liveregions); |
1387 | kfree(objp: o2hb_db_quorumregions); |
1388 | kfree(objp: o2hb_db_failedregions); |
1389 | } |
1390 | |
1391 | static void o2hb_debug_create(const char *name, struct dentry *dir, |
1392 | struct o2hb_debug_buf **db, int db_len, int type, |
1393 | int size, int len, void *data) |
1394 | { |
1395 | *db = kmalloc(size: db_len, GFP_KERNEL); |
1396 | if (!*db) |
1397 | return; |
1398 | |
1399 | (*db)->db_type = type; |
1400 | (*db)->db_size = size; |
1401 | (*db)->db_len = len; |
1402 | (*db)->db_data = data; |
1403 | |
1404 | debugfs_create_file(name, S_IFREG|S_IRUSR, parent: dir, data: *db, fops: &o2hb_debug_fops); |
1405 | } |
1406 | |
1407 | static void o2hb_debug_init(void) |
1408 | { |
1409 | o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); |
1410 | |
1411 | o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir: o2hb_debug_dir, |
1412 | db: &o2hb_db_livenodes, db_len: sizeof(*o2hb_db_livenodes), |
1413 | O2HB_DB_TYPE_LIVENODES, size: sizeof(o2hb_live_node_bitmap), |
1414 | O2NM_MAX_NODES, data: o2hb_live_node_bitmap); |
1415 | |
1416 | o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, dir: o2hb_debug_dir, |
1417 | db: &o2hb_db_liveregions, db_len: sizeof(*o2hb_db_liveregions), |
1418 | O2HB_DB_TYPE_LIVEREGIONS, |
1419 | size: sizeof(o2hb_live_region_bitmap), O2NM_MAX_REGIONS, |
1420 | data: o2hb_live_region_bitmap); |
1421 | |
1422 | o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, dir: o2hb_debug_dir, |
1423 | db: &o2hb_db_quorumregions, |
1424 | db_len: sizeof(*o2hb_db_quorumregions), |
1425 | O2HB_DB_TYPE_QUORUMREGIONS, |
1426 | size: sizeof(o2hb_quorum_region_bitmap), O2NM_MAX_REGIONS, |
1427 | data: o2hb_quorum_region_bitmap); |
1428 | |
1429 | o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, dir: o2hb_debug_dir, |
1430 | db: &o2hb_db_failedregions, |
1431 | db_len: sizeof(*o2hb_db_failedregions), |
1432 | O2HB_DB_TYPE_FAILEDREGIONS, |
1433 | size: sizeof(o2hb_failed_region_bitmap), O2NM_MAX_REGIONS, |
1434 | data: o2hb_failed_region_bitmap); |
1435 | } |
1436 | |
1437 | void o2hb_init(void) |
1438 | { |
1439 | int i; |
1440 | |
1441 | for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++) |
1442 | INIT_LIST_HEAD(list: &o2hb_callbacks[i].list); |
1443 | |
1444 | for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) |
1445 | INIT_LIST_HEAD(list: &o2hb_live_slots[i]); |
1446 | |
1447 | bitmap_zero(dst: o2hb_live_node_bitmap, O2NM_MAX_NODES); |
1448 | bitmap_zero(dst: o2hb_region_bitmap, O2NM_MAX_REGIONS); |
1449 | bitmap_zero(dst: o2hb_live_region_bitmap, O2NM_MAX_REGIONS); |
1450 | bitmap_zero(dst: o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); |
1451 | bitmap_zero(dst: o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); |
1452 | |
1453 | o2hb_dependent_users = 0; |
1454 | |
1455 | o2hb_debug_init(); |
1456 | } |
1457 | |
1458 | /* if we're already in a callback then we're already serialized by the sem */ |
1459 | static void o2hb_fill_node_map_from_callback(unsigned long *map, |
1460 | unsigned int bits) |
1461 | { |
1462 | bitmap_copy(dst: map, src: o2hb_live_node_bitmap, nbits: bits); |
1463 | } |
1464 | |
1465 | /* |
1466 | * get a map of all nodes that are heartbeating in any regions |
1467 | */ |
1468 | void o2hb_fill_node_map(unsigned long *map, unsigned int bits) |
1469 | { |
1470 | /* callers want to serialize this map and callbacks so that they |
1471 | * can trust that they don't miss nodes coming to the party */ |
1472 | down_read(sem: &o2hb_callback_sem); |
1473 | spin_lock(lock: &o2hb_live_lock); |
1474 | o2hb_fill_node_map_from_callback(map, bits); |
1475 | spin_unlock(lock: &o2hb_live_lock); |
1476 | up_read(sem: &o2hb_callback_sem); |
1477 | } |
1478 | EXPORT_SYMBOL_GPL(o2hb_fill_node_map); |
1479 | |
1480 | /* |
1481 | * heartbeat configfs bits. The heartbeat set is a default set under |
1482 | * the cluster set in nodemanager.c. |
1483 | */ |
1484 | |
1485 | static struct o2hb_region *to_o2hb_region(struct config_item *item) |
1486 | { |
1487 | return item ? container_of(item, struct o2hb_region, hr_item) : NULL; |
1488 | } |
1489 | |
1490 | /* drop_item only drops its ref after killing the thread, nothing should |
1491 | * be using the region anymore. this has to clean up any state that |
1492 | * attributes might have built up. */ |
1493 | static void o2hb_region_release(struct config_item *item) |
1494 | { |
1495 | int i; |
1496 | struct page *page; |
1497 | struct o2hb_region *reg = to_o2hb_region(item); |
1498 | |
1499 | mlog(ML_HEARTBEAT, "hb region release (%pg)\n" , reg_bdev(reg)); |
1500 | |
1501 | kfree(objp: reg->hr_tmp_block); |
1502 | |
1503 | if (reg->hr_slot_data) { |
1504 | for (i = 0; i < reg->hr_num_pages; i++) { |
1505 | page = reg->hr_slot_data[i]; |
1506 | if (page) |
1507 | __free_page(page); |
1508 | } |
1509 | kfree(objp: reg->hr_slot_data); |
1510 | } |
1511 | |
1512 | if (reg->hr_bdev_file) |
1513 | fput(reg->hr_bdev_file); |
1514 | |
1515 | kfree(objp: reg->hr_slots); |
1516 | |
1517 | debugfs_remove_recursive(dentry: reg->hr_debug_dir); |
1518 | kfree(objp: reg->hr_db_livenodes); |
1519 | kfree(objp: reg->hr_db_regnum); |
1520 | kfree(objp: reg->hr_db_elapsed_time); |
1521 | kfree(objp: reg->hr_db_pinned); |
1522 | |
1523 | spin_lock(lock: &o2hb_live_lock); |
1524 | list_del(entry: ®->hr_all_item); |
1525 | spin_unlock(lock: &o2hb_live_lock); |
1526 | |
1527 | o2net_unregister_handler_list(list: ®->hr_handler_list); |
1528 | kfree(objp: reg); |
1529 | } |
1530 | |
1531 | static int o2hb_read_block_input(struct o2hb_region *reg, |
1532 | const char *page, |
1533 | unsigned long *ret_bytes, |
1534 | unsigned int *ret_bits) |
1535 | { |
1536 | unsigned long bytes; |
1537 | char *p = (char *)page; |
1538 | |
1539 | bytes = simple_strtoul(p, &p, 0); |
1540 | if (!p || (*p && (*p != '\n'))) |
1541 | return -EINVAL; |
1542 | |
1543 | /* Heartbeat and fs min / max block sizes are the same. */ |
1544 | if (bytes > 4096 || bytes < 512) |
1545 | return -ERANGE; |
1546 | if (hweight16(bytes) != 1) |
1547 | return -EINVAL; |
1548 | |
1549 | if (ret_bytes) |
1550 | *ret_bytes = bytes; |
1551 | if (ret_bits) |
1552 | *ret_bits = ffs(bytes) - 1; |
1553 | |
1554 | return 0; |
1555 | } |
1556 | |
1557 | static ssize_t o2hb_region_block_bytes_show(struct config_item *item, |
1558 | char *page) |
1559 | { |
1560 | return sprintf(buf: page, fmt: "%u\n" , to_o2hb_region(item)->hr_block_bytes); |
1561 | } |
1562 | |
1563 | static ssize_t o2hb_region_block_bytes_store(struct config_item *item, |
1564 | const char *page, |
1565 | size_t count) |
1566 | { |
1567 | struct o2hb_region *reg = to_o2hb_region(item); |
1568 | int status; |
1569 | unsigned long block_bytes; |
1570 | unsigned int block_bits; |
1571 | |
1572 | if (reg->hr_bdev_file) |
1573 | return -EINVAL; |
1574 | |
1575 | status = o2hb_read_block_input(reg, page, ret_bytes: &block_bytes, |
1576 | ret_bits: &block_bits); |
1577 | if (status) |
1578 | return status; |
1579 | |
1580 | reg->hr_block_bytes = (unsigned int)block_bytes; |
1581 | reg->hr_block_bits = block_bits; |
1582 | |
1583 | return count; |
1584 | } |
1585 | |
1586 | static ssize_t o2hb_region_start_block_show(struct config_item *item, |
1587 | char *page) |
1588 | { |
1589 | return sprintf(buf: page, fmt: "%llu\n" , to_o2hb_region(item)->hr_start_block); |
1590 | } |
1591 | |
1592 | static ssize_t o2hb_region_start_block_store(struct config_item *item, |
1593 | const char *page, |
1594 | size_t count) |
1595 | { |
1596 | struct o2hb_region *reg = to_o2hb_region(item); |
1597 | unsigned long long tmp; |
1598 | char *p = (char *)page; |
1599 | ssize_t ret; |
1600 | |
1601 | if (reg->hr_bdev_file) |
1602 | return -EINVAL; |
1603 | |
1604 | ret = kstrtoull(s: p, base: 0, res: &tmp); |
1605 | if (ret) |
1606 | return -EINVAL; |
1607 | |
1608 | reg->hr_start_block = tmp; |
1609 | |
1610 | return count; |
1611 | } |
1612 | |
1613 | static ssize_t o2hb_region_blocks_show(struct config_item *item, char *page) |
1614 | { |
1615 | return sprintf(buf: page, fmt: "%d\n" , to_o2hb_region(item)->hr_blocks); |
1616 | } |
1617 | |
1618 | static ssize_t o2hb_region_blocks_store(struct config_item *item, |
1619 | const char *page, |
1620 | size_t count) |
1621 | { |
1622 | struct o2hb_region *reg = to_o2hb_region(item); |
1623 | unsigned long tmp; |
1624 | char *p = (char *)page; |
1625 | |
1626 | if (reg->hr_bdev_file) |
1627 | return -EINVAL; |
1628 | |
1629 | tmp = simple_strtoul(p, &p, 0); |
1630 | if (!p || (*p && (*p != '\n'))) |
1631 | return -EINVAL; |
1632 | |
1633 | if (tmp > O2NM_MAX_NODES || tmp == 0) |
1634 | return -ERANGE; |
1635 | |
1636 | reg->hr_blocks = (unsigned int)tmp; |
1637 | |
1638 | return count; |
1639 | } |
1640 | |
1641 | static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) |
1642 | { |
1643 | unsigned int ret = 0; |
1644 | |
1645 | if (to_o2hb_region(item)->hr_bdev_file) |
1646 | ret = sprintf(buf: page, fmt: "%pg\n" , reg_bdev(reg: to_o2hb_region(item))); |
1647 | |
1648 | return ret; |
1649 | } |
1650 | |
1651 | static void o2hb_init_region_params(struct o2hb_region *reg) |
1652 | { |
1653 | reg->hr_slots_per_page = PAGE_SIZE >> reg->hr_block_bits; |
1654 | reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS; |
1655 | |
1656 | mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n" , |
1657 | reg->hr_start_block, reg->hr_blocks); |
1658 | mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n" , |
1659 | reg->hr_block_bytes, reg->hr_block_bits); |
1660 | mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n" , reg->hr_timeout_ms); |
1661 | mlog(ML_HEARTBEAT, "dead threshold = %u\n" , o2hb_dead_threshold); |
1662 | } |
1663 | |
1664 | static int o2hb_map_slot_data(struct o2hb_region *reg) |
1665 | { |
1666 | int i, j; |
1667 | unsigned int last_slot; |
1668 | unsigned int spp = reg->hr_slots_per_page; |
1669 | struct page *page; |
1670 | char *raw; |
1671 | struct o2hb_disk_slot *slot; |
1672 | |
1673 | reg->hr_tmp_block = kmalloc(size: reg->hr_block_bytes, GFP_KERNEL); |
1674 | if (reg->hr_tmp_block == NULL) |
1675 | return -ENOMEM; |
1676 | |
1677 | reg->hr_slots = kcalloc(n: reg->hr_blocks, |
1678 | size: sizeof(struct o2hb_disk_slot), GFP_KERNEL); |
1679 | if (reg->hr_slots == NULL) |
1680 | return -ENOMEM; |
1681 | |
1682 | for(i = 0; i < reg->hr_blocks; i++) { |
1683 | slot = ®->hr_slots[i]; |
1684 | slot->ds_node_num = i; |
1685 | INIT_LIST_HEAD(list: &slot->ds_live_item); |
1686 | slot->ds_raw_block = NULL; |
1687 | } |
1688 | |
1689 | reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp; |
1690 | mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks " |
1691 | "at %u blocks per page\n" , |
1692 | reg->hr_num_pages, reg->hr_blocks, spp); |
1693 | |
1694 | reg->hr_slot_data = kcalloc(n: reg->hr_num_pages, size: sizeof(struct page *), |
1695 | GFP_KERNEL); |
1696 | if (!reg->hr_slot_data) |
1697 | return -ENOMEM; |
1698 | |
1699 | for(i = 0; i < reg->hr_num_pages; i++) { |
1700 | page = alloc_page(GFP_KERNEL); |
1701 | if (!page) |
1702 | return -ENOMEM; |
1703 | |
1704 | reg->hr_slot_data[i] = page; |
1705 | |
1706 | last_slot = i * spp; |
1707 | raw = page_address(page); |
1708 | for (j = 0; |
1709 | (j < spp) && ((j + last_slot) < reg->hr_blocks); |
1710 | j++) { |
1711 | BUG_ON((j + last_slot) >= reg->hr_blocks); |
1712 | |
1713 | slot = ®->hr_slots[j + last_slot]; |
1714 | slot->ds_raw_block = |
1715 | (struct o2hb_disk_heartbeat_block *) raw; |
1716 | |
1717 | raw += reg->hr_block_bytes; |
1718 | } |
1719 | } |
1720 | |
1721 | return 0; |
1722 | } |
1723 | |
1724 | /* Read in all the slots available and populate the tracking |
1725 | * structures so that we can start with a baseline idea of what's |
1726 | * there. */ |
1727 | static int o2hb_populate_slot_data(struct o2hb_region *reg) |
1728 | { |
1729 | int ret, i; |
1730 | struct o2hb_disk_slot *slot; |
1731 | struct o2hb_disk_heartbeat_block *hb_block; |
1732 | |
1733 | ret = o2hb_read_slots(reg, begin_slot: 0, max_slots: reg->hr_blocks); |
1734 | if (ret) |
1735 | goto out; |
1736 | |
1737 | /* We only want to get an idea of the values initially in each |
1738 | * slot, so we do no verification - o2hb_check_slot will |
1739 | * actually determine if each configured slot is valid and |
1740 | * whether any values have changed. */ |
1741 | for(i = 0; i < reg->hr_blocks; i++) { |
1742 | slot = ®->hr_slots[i]; |
1743 | hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block; |
1744 | |
1745 | /* Only fill the values that o2hb_check_slot uses to |
1746 | * determine changing slots */ |
1747 | slot->ds_last_time = le64_to_cpu(hb_block->hb_seq); |
1748 | slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); |
1749 | } |
1750 | |
1751 | out: |
1752 | return ret; |
1753 | } |
1754 | |
1755 | /* |
1756 | * this is acting as commit; we set up all of hr_bdev_file and hr_task or |
1757 | * nothing |
1758 | */ |
1759 | static ssize_t o2hb_region_dev_store(struct config_item *item, |
1760 | const char *page, |
1761 | size_t count) |
1762 | { |
1763 | struct o2hb_region *reg = to_o2hb_region(item); |
1764 | struct task_struct *hb_task; |
1765 | long fd; |
1766 | int sectsize; |
1767 | char *p = (char *)page; |
1768 | struct fd f; |
1769 | ssize_t ret = -EINVAL; |
1770 | int live_threshold; |
1771 | |
1772 | if (reg->hr_bdev_file) |
1773 | goto out; |
1774 | |
1775 | /* We can't heartbeat without having had our node number |
1776 | * configured yet. */ |
1777 | if (o2nm_this_node() == O2NM_MAX_NODES) |
1778 | goto out; |
1779 | |
1780 | fd = simple_strtol(p, &p, 0); |
1781 | if (!p || (*p && (*p != '\n'))) |
1782 | goto out; |
1783 | |
1784 | if (fd < 0 || fd >= INT_MAX) |
1785 | goto out; |
1786 | |
1787 | f = fdget(fd); |
1788 | if (f.file == NULL) |
1789 | goto out; |
1790 | |
1791 | if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || |
1792 | reg->hr_block_bytes == 0) |
1793 | goto out2; |
1794 | |
1795 | if (!S_ISBLK(f.file->f_mapping->host->i_mode)) |
1796 | goto out2; |
1797 | |
1798 | reg->hr_bdev_file = bdev_file_open_by_dev(dev: f.file->f_mapping->host->i_rdev, |
1799 | BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL); |
1800 | if (IS_ERR(ptr: reg->hr_bdev_file)) { |
1801 | ret = PTR_ERR(ptr: reg->hr_bdev_file); |
1802 | reg->hr_bdev_file = NULL; |
1803 | goto out2; |
1804 | } |
1805 | |
1806 | sectsize = bdev_logical_block_size(bdev: reg_bdev(reg)); |
1807 | if (sectsize != reg->hr_block_bytes) { |
1808 | mlog(ML_ERROR, |
1809 | "blocksize %u incorrect for device, expected %d" , |
1810 | reg->hr_block_bytes, sectsize); |
1811 | ret = -EINVAL; |
1812 | goto out3; |
1813 | } |
1814 | |
1815 | o2hb_init_region_params(reg); |
1816 | |
1817 | /* Generation of zero is invalid */ |
1818 | do { |
1819 | get_random_bytes(buf: ®->hr_generation, |
1820 | len: sizeof(reg->hr_generation)); |
1821 | } while (reg->hr_generation == 0); |
1822 | |
1823 | ret = o2hb_map_slot_data(reg); |
1824 | if (ret) { |
1825 | mlog_errno(ret); |
1826 | goto out3; |
1827 | } |
1828 | |
1829 | ret = o2hb_populate_slot_data(reg); |
1830 | if (ret) { |
1831 | mlog_errno(ret); |
1832 | goto out3; |
1833 | } |
1834 | |
1835 | INIT_DELAYED_WORK(®->hr_write_timeout_work, o2hb_write_timeout); |
1836 | INIT_DELAYED_WORK(®->hr_nego_timeout_work, o2hb_nego_timeout); |
1837 | |
1838 | /* |
1839 | * A node is considered live after it has beat LIVE_THRESHOLD |
1840 | * times. We're not steady until we've given them a chance |
1841 | * _after_ our first read. |
1842 | * The default threshold is bare minimum so as to limit the delay |
1843 | * during mounts. For global heartbeat, the threshold doubled for the |
1844 | * first region. |
1845 | */ |
1846 | live_threshold = O2HB_LIVE_THRESHOLD; |
1847 | if (o2hb_global_heartbeat_active()) { |
1848 | spin_lock(lock: &o2hb_live_lock); |
1849 | if (bitmap_weight(src: o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1) |
1850 | live_threshold <<= 1; |
1851 | spin_unlock(lock: &o2hb_live_lock); |
1852 | } |
1853 | ++live_threshold; |
1854 | atomic_set(v: ®->hr_steady_iterations, i: live_threshold); |
1855 | /* unsteady_iterations is triple the steady_iterations */ |
1856 | atomic_set(v: ®->hr_unsteady_iterations, i: (live_threshold * 3)); |
1857 | |
1858 | hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s" , |
1859 | reg->hr_item.ci_name); |
1860 | if (IS_ERR(ptr: hb_task)) { |
1861 | ret = PTR_ERR(ptr: hb_task); |
1862 | mlog_errno(ret); |
1863 | goto out3; |
1864 | } |
1865 | |
1866 | spin_lock(lock: &o2hb_live_lock); |
1867 | reg->hr_task = hb_task; |
1868 | spin_unlock(lock: &o2hb_live_lock); |
1869 | |
1870 | ret = wait_event_interruptible(o2hb_steady_queue, |
1871 | atomic_read(®->hr_steady_iterations) == 0 || |
1872 | reg->hr_node_deleted); |
1873 | if (ret) { |
1874 | atomic_set(v: ®->hr_steady_iterations, i: 0); |
1875 | reg->hr_aborted_start = 1; |
1876 | } |
1877 | |
1878 | if (reg->hr_aborted_start) { |
1879 | ret = -EIO; |
1880 | goto out3; |
1881 | } |
1882 | |
1883 | if (reg->hr_node_deleted) { |
1884 | ret = -EINVAL; |
1885 | goto out3; |
1886 | } |
1887 | |
1888 | /* Ok, we were woken. Make sure it wasn't by drop_item() */ |
1889 | spin_lock(lock: &o2hb_live_lock); |
1890 | hb_task = reg->hr_task; |
1891 | if (o2hb_global_heartbeat_active()) |
1892 | set_bit(nr: reg->hr_region_num, addr: o2hb_live_region_bitmap); |
1893 | spin_unlock(lock: &o2hb_live_lock); |
1894 | |
1895 | if (hb_task) |
1896 | ret = count; |
1897 | else |
1898 | ret = -EIO; |
1899 | |
1900 | if (hb_task && o2hb_global_heartbeat_active()) |
1901 | printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n" , |
1902 | config_item_name(®->hr_item), reg_bdev(reg)); |
1903 | |
1904 | out3: |
1905 | if (ret < 0) { |
1906 | fput(reg->hr_bdev_file); |
1907 | reg->hr_bdev_file = NULL; |
1908 | } |
1909 | out2: |
1910 | fdput(fd: f); |
1911 | out: |
1912 | return ret; |
1913 | } |
1914 | |
1915 | static ssize_t o2hb_region_pid_show(struct config_item *item, char *page) |
1916 | { |
1917 | struct o2hb_region *reg = to_o2hb_region(item); |
1918 | pid_t pid = 0; |
1919 | |
1920 | spin_lock(lock: &o2hb_live_lock); |
1921 | if (reg->hr_task) |
1922 | pid = task_pid_nr(tsk: reg->hr_task); |
1923 | spin_unlock(lock: &o2hb_live_lock); |
1924 | |
1925 | if (!pid) |
1926 | return 0; |
1927 | |
1928 | return sprintf(buf: page, fmt: "%u\n" , pid); |
1929 | } |
1930 | |
1931 | CONFIGFS_ATTR(o2hb_region_, block_bytes); |
1932 | CONFIGFS_ATTR(o2hb_region_, start_block); |
1933 | CONFIGFS_ATTR(o2hb_region_, blocks); |
1934 | CONFIGFS_ATTR(o2hb_region_, dev); |
1935 | CONFIGFS_ATTR_RO(o2hb_region_, pid); |
1936 | |
1937 | static struct configfs_attribute *o2hb_region_attrs[] = { |
1938 | &o2hb_region_attr_block_bytes, |
1939 | &o2hb_region_attr_start_block, |
1940 | &o2hb_region_attr_blocks, |
1941 | &o2hb_region_attr_dev, |
1942 | &o2hb_region_attr_pid, |
1943 | NULL, |
1944 | }; |
1945 | |
1946 | static struct configfs_item_operations o2hb_region_item_ops = { |
1947 | .release = o2hb_region_release, |
1948 | }; |
1949 | |
1950 | static const struct config_item_type o2hb_region_type = { |
1951 | .ct_item_ops = &o2hb_region_item_ops, |
1952 | .ct_attrs = o2hb_region_attrs, |
1953 | .ct_owner = THIS_MODULE, |
1954 | }; |
1955 | |
1956 | /* heartbeat set */ |
1957 | |
1958 | struct o2hb_heartbeat_group { |
1959 | struct config_group hs_group; |
1960 | /* some stuff? */ |
1961 | }; |
1962 | |
1963 | static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group) |
1964 | { |
1965 | return group ? |
1966 | container_of(group, struct o2hb_heartbeat_group, hs_group) |
1967 | : NULL; |
1968 | } |
1969 | |
1970 | static void o2hb_debug_region_init(struct o2hb_region *reg, |
1971 | struct dentry *parent) |
1972 | { |
1973 | struct dentry *dir; |
1974 | |
1975 | dir = debugfs_create_dir(name: config_item_name(item: ®->hr_item), parent); |
1976 | reg->hr_debug_dir = dir; |
1977 | |
1978 | o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, db: &(reg->hr_db_livenodes), |
1979 | db_len: sizeof(*(reg->hr_db_livenodes)), |
1980 | O2HB_DB_TYPE_REGION_LIVENODES, |
1981 | size: sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES, |
1982 | data: reg); |
1983 | |
1984 | o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, db: &(reg->hr_db_regnum), |
1985 | db_len: sizeof(*(reg->hr_db_regnum)), |
1986 | O2HB_DB_TYPE_REGION_NUMBER, size: 0, O2NM_MAX_NODES, data: reg); |
1987 | |
1988 | o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir, |
1989 | db: &(reg->hr_db_elapsed_time), |
1990 | db_len: sizeof(*(reg->hr_db_elapsed_time)), |
1991 | O2HB_DB_TYPE_REGION_ELAPSED_TIME, size: 0, len: 0, data: reg); |
1992 | |
1993 | o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, db: &(reg->hr_db_pinned), |
1994 | db_len: sizeof(*(reg->hr_db_pinned)), |
1995 | O2HB_DB_TYPE_REGION_PINNED, size: 0, len: 0, data: reg); |
1996 | |
1997 | } |
1998 | |
1999 | static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, |
2000 | const char *name) |
2001 | { |
2002 | struct o2hb_region *reg = NULL; |
2003 | int ret; |
2004 | |
2005 | reg = kzalloc(size: sizeof(struct o2hb_region), GFP_KERNEL); |
2006 | if (reg == NULL) |
2007 | return ERR_PTR(error: -ENOMEM); |
2008 | |
2009 | if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) { |
2010 | ret = -ENAMETOOLONG; |
2011 | goto free; |
2012 | } |
2013 | |
2014 | spin_lock(lock: &o2hb_live_lock); |
2015 | reg->hr_region_num = 0; |
2016 | if (o2hb_global_heartbeat_active()) { |
2017 | reg->hr_region_num = find_first_zero_bit(addr: o2hb_region_bitmap, |
2018 | O2NM_MAX_REGIONS); |
2019 | if (reg->hr_region_num >= O2NM_MAX_REGIONS) { |
2020 | spin_unlock(lock: &o2hb_live_lock); |
2021 | ret = -EFBIG; |
2022 | goto free; |
2023 | } |
2024 | set_bit(nr: reg->hr_region_num, addr: o2hb_region_bitmap); |
2025 | } |
2026 | list_add_tail(new: ®->hr_all_item, head: &o2hb_all_regions); |
2027 | spin_unlock(lock: &o2hb_live_lock); |
2028 | |
2029 | config_item_init_type_name(item: ®->hr_item, name, type: &o2hb_region_type); |
2030 | |
2031 | /* this is the same way to generate msg key as dlm, for local heartbeat, |
2032 | * name is also the same, so make initial crc value different to avoid |
2033 | * message key conflict. |
2034 | */ |
2035 | reg->hr_key = crc32_le(crc: reg->hr_region_num + O2NM_MAX_REGIONS, |
2036 | p: name, strlen(name)); |
2037 | INIT_LIST_HEAD(list: ®->hr_handler_list); |
2038 | ret = o2net_register_handler(msg_type: O2HB_NEGO_TIMEOUT_MSG, key: reg->hr_key, |
2039 | max_len: sizeof(struct o2hb_nego_msg), |
2040 | func: o2hb_nego_timeout_handler, |
2041 | data: reg, NULL, unreg_list: ®->hr_handler_list); |
2042 | if (ret) |
2043 | goto remove_item; |
2044 | |
2045 | ret = o2net_register_handler(msg_type: O2HB_NEGO_APPROVE_MSG, key: reg->hr_key, |
2046 | max_len: sizeof(struct o2hb_nego_msg), |
2047 | func: o2hb_nego_approve_handler, |
2048 | data: reg, NULL, unreg_list: ®->hr_handler_list); |
2049 | if (ret) |
2050 | goto unregister_handler; |
2051 | |
2052 | o2hb_debug_region_init(reg, parent: o2hb_debug_dir); |
2053 | |
2054 | return ®->hr_item; |
2055 | |
2056 | unregister_handler: |
2057 | o2net_unregister_handler_list(list: ®->hr_handler_list); |
2058 | remove_item: |
2059 | spin_lock(lock: &o2hb_live_lock); |
2060 | list_del(entry: ®->hr_all_item); |
2061 | if (o2hb_global_heartbeat_active()) |
2062 | clear_bit(nr: reg->hr_region_num, addr: o2hb_region_bitmap); |
2063 | spin_unlock(lock: &o2hb_live_lock); |
2064 | free: |
2065 | kfree(objp: reg); |
2066 | return ERR_PTR(error: ret); |
2067 | } |
2068 | |
2069 | static void o2hb_heartbeat_group_drop_item(struct config_group *group, |
2070 | struct config_item *item) |
2071 | { |
2072 | struct task_struct *hb_task; |
2073 | struct o2hb_region *reg = to_o2hb_region(item); |
2074 | int quorum_region = 0; |
2075 | |
2076 | /* stop the thread when the user removes the region dir */ |
2077 | spin_lock(lock: &o2hb_live_lock); |
2078 | hb_task = reg->hr_task; |
2079 | reg->hr_task = NULL; |
2080 | reg->hr_item_dropped = 1; |
2081 | spin_unlock(lock: &o2hb_live_lock); |
2082 | |
2083 | if (hb_task) |
2084 | kthread_stop(k: hb_task); |
2085 | |
2086 | if (o2hb_global_heartbeat_active()) { |
2087 | spin_lock(lock: &o2hb_live_lock); |
2088 | clear_bit(nr: reg->hr_region_num, addr: o2hb_region_bitmap); |
2089 | clear_bit(nr: reg->hr_region_num, addr: o2hb_live_region_bitmap); |
2090 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) |
2091 | quorum_region = 1; |
2092 | clear_bit(nr: reg->hr_region_num, addr: o2hb_quorum_region_bitmap); |
2093 | spin_unlock(lock: &o2hb_live_lock); |
2094 | printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n" , |
2095 | ((atomic_read(®->hr_steady_iterations) == 0) ? |
2096 | "stopped" : "start aborted" ), config_item_name(item), |
2097 | reg_bdev(reg)); |
2098 | } |
2099 | |
2100 | /* |
2101 | * If we're racing a dev_write(), we need to wake them. They will |
2102 | * check reg->hr_task |
2103 | */ |
2104 | if (atomic_read(v: ®->hr_steady_iterations) != 0) { |
2105 | reg->hr_aborted_start = 1; |
2106 | atomic_set(v: ®->hr_steady_iterations, i: 0); |
2107 | wake_up(&o2hb_steady_queue); |
2108 | } |
2109 | |
2110 | config_item_put(item); |
2111 | |
2112 | if (!o2hb_global_heartbeat_active() || !quorum_region) |
2113 | return; |
2114 | |
2115 | /* |
2116 | * If global heartbeat active and there are dependent users, |
2117 | * pin all regions if quorum region count <= CUT_OFF |
2118 | */ |
2119 | spin_lock(lock: &o2hb_live_lock); |
2120 | |
2121 | if (!o2hb_dependent_users) |
2122 | goto unlock; |
2123 | |
2124 | if (bitmap_weight(src: o2hb_quorum_region_bitmap, |
2125 | O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) |
2126 | o2hb_region_pin(NULL); |
2127 | |
2128 | unlock: |
2129 | spin_unlock(lock: &o2hb_live_lock); |
2130 | } |
2131 | |
2132 | static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item, |
2133 | char *page) |
2134 | { |
2135 | return sprintf(buf: page, fmt: "%u\n" , o2hb_dead_threshold); |
2136 | } |
2137 | |
2138 | static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item, |
2139 | const char *page, size_t count) |
2140 | { |
2141 | unsigned long tmp; |
2142 | char *p = (char *)page; |
2143 | |
2144 | tmp = simple_strtoul(p, &p, 10); |
2145 | if (!p || (*p && (*p != '\n'))) |
2146 | return -EINVAL; |
2147 | |
2148 | /* this will validate ranges for us. */ |
2149 | o2hb_dead_threshold_set(threshold: (unsigned int) tmp); |
2150 | |
2151 | return count; |
2152 | } |
2153 | |
2154 | static ssize_t o2hb_heartbeat_group_mode_show(struct config_item *item, |
2155 | char *page) |
2156 | { |
2157 | return sprintf(buf: page, fmt: "%s\n" , |
2158 | o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]); |
2159 | } |
2160 | |
2161 | static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item, |
2162 | const char *page, size_t count) |
2163 | { |
2164 | unsigned int i; |
2165 | int ret; |
2166 | size_t len; |
2167 | |
2168 | len = (page[count - 1] == '\n') ? count - 1 : count; |
2169 | if (!len) |
2170 | return -EINVAL; |
2171 | |
2172 | for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) { |
2173 | if (strncasecmp(s1: page, s2: o2hb_heartbeat_mode_desc[i], n: len)) |
2174 | continue; |
2175 | |
2176 | ret = o2hb_global_heartbeat_mode_set(hb_mode: i); |
2177 | if (!ret) |
2178 | printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n" , |
2179 | o2hb_heartbeat_mode_desc[i]); |
2180 | return count; |
2181 | } |
2182 | |
2183 | return -EINVAL; |
2184 | |
2185 | } |
2186 | |
2187 | CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold); |
2188 | CONFIGFS_ATTR(o2hb_heartbeat_group_, mode); |
2189 | |
2190 | static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { |
2191 | &o2hb_heartbeat_group_attr_dead_threshold, |
2192 | &o2hb_heartbeat_group_attr_mode, |
2193 | NULL, |
2194 | }; |
2195 | |
2196 | static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { |
2197 | .make_item = o2hb_heartbeat_group_make_item, |
2198 | .drop_item = o2hb_heartbeat_group_drop_item, |
2199 | }; |
2200 | |
2201 | static const struct config_item_type o2hb_heartbeat_group_type = { |
2202 | .ct_group_ops = &o2hb_heartbeat_group_group_ops, |
2203 | .ct_attrs = o2hb_heartbeat_group_attrs, |
2204 | .ct_owner = THIS_MODULE, |
2205 | }; |
2206 | |
2207 | /* this is just here to avoid touching group in heartbeat.h which the |
2208 | * entire damn world #includes */ |
2209 | struct config_group *o2hb_alloc_hb_set(void) |
2210 | { |
2211 | struct o2hb_heartbeat_group *hs = NULL; |
2212 | struct config_group *ret = NULL; |
2213 | |
2214 | hs = kzalloc(size: sizeof(struct o2hb_heartbeat_group), GFP_KERNEL); |
2215 | if (hs == NULL) |
2216 | goto out; |
2217 | |
2218 | config_group_init_type_name(group: &hs->hs_group, name: "heartbeat" , |
2219 | type: &o2hb_heartbeat_group_type); |
2220 | |
2221 | ret = &hs->hs_group; |
2222 | out: |
2223 | if (ret == NULL) |
2224 | kfree(objp: hs); |
2225 | return ret; |
2226 | } |
2227 | |
/*
 * Free a heartbeat group obtained from o2hb_alloc_hb_set().  A NULL @group
 * is accepted and results in kfree(NULL).
 */
void o2hb_free_hb_set(struct config_group *group)
{
	kfree(to_o2hb_heartbeat_group(group));
}
2233 | |
2234 | /* hb callback registration and issuing */ |
2235 | |
2236 | static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type) |
2237 | { |
2238 | if (type == O2HB_NUM_CB) |
2239 | return ERR_PTR(error: -EINVAL); |
2240 | |
2241 | return &o2hb_callbacks[type]; |
2242 | } |
2243 | |
2244 | void o2hb_setup_callback(struct o2hb_callback_func *hc, |
2245 | enum o2hb_callback_type type, |
2246 | o2hb_cb_func *func, |
2247 | void *data, |
2248 | int priority) |
2249 | { |
2250 | INIT_LIST_HEAD(list: &hc->hc_item); |
2251 | hc->hc_func = func; |
2252 | hc->hc_data = data; |
2253 | hc->hc_priority = priority; |
2254 | hc->hc_type = type; |
2255 | hc->hc_magic = O2HB_CB_MAGIC; |
2256 | } |
2257 | EXPORT_SYMBOL_GPL(o2hb_setup_callback); |
2258 | |
2259 | /* |
2260 | * In local heartbeat mode, region_uuid passed matches the dlm domain name. |
2261 | * In global heartbeat mode, region_uuid passed is NULL. |
2262 | * |
2263 | * In local, we only pin the matching region. In global we pin all the active |
2264 | * regions. |
2265 | */ |
2266 | static int o2hb_region_pin(const char *region_uuid) |
2267 | { |
2268 | int ret = 0, found = 0; |
2269 | struct o2hb_region *reg; |
2270 | char *uuid; |
2271 | |
2272 | assert_spin_locked(&o2hb_live_lock); |
2273 | |
2274 | list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { |
2275 | if (reg->hr_item_dropped) |
2276 | continue; |
2277 | |
2278 | uuid = config_item_name(item: ®->hr_item); |
2279 | |
2280 | /* local heartbeat */ |
2281 | if (region_uuid) { |
2282 | if (strcmp(region_uuid, uuid)) |
2283 | continue; |
2284 | found = 1; |
2285 | } |
2286 | |
2287 | if (reg->hr_item_pinned || reg->hr_item_dropped) |
2288 | goto skip_pin; |
2289 | |
2290 | /* Ignore ENOENT only for local hb (userdlm domain) */ |
2291 | ret = o2nm_depend_item(item: ®->hr_item); |
2292 | if (!ret) { |
2293 | mlog(ML_CLUSTER, "Pin region %s\n" , uuid); |
2294 | reg->hr_item_pinned = 1; |
2295 | } else { |
2296 | if (ret == -ENOENT && found) |
2297 | ret = 0; |
2298 | else { |
2299 | mlog(ML_ERROR, "Pin region %s fails with %d\n" , |
2300 | uuid, ret); |
2301 | break; |
2302 | } |
2303 | } |
2304 | skip_pin: |
2305 | if (found) |
2306 | break; |
2307 | } |
2308 | |
2309 | return ret; |
2310 | } |
2311 | |
2312 | /* |
2313 | * In local heartbeat mode, region_uuid passed matches the dlm domain name. |
2314 | * In global heartbeat mode, region_uuid passed is NULL. |
2315 | * |
2316 | * In local, we only unpin the matching region. In global we unpin all the |
2317 | * active regions. |
2318 | */ |
2319 | static void o2hb_region_unpin(const char *region_uuid) |
2320 | { |
2321 | struct o2hb_region *reg; |
2322 | char *uuid; |
2323 | int found = 0; |
2324 | |
2325 | assert_spin_locked(&o2hb_live_lock); |
2326 | |
2327 | list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { |
2328 | if (reg->hr_item_dropped) |
2329 | continue; |
2330 | |
2331 | uuid = config_item_name(item: ®->hr_item); |
2332 | if (region_uuid) { |
2333 | if (strcmp(region_uuid, uuid)) |
2334 | continue; |
2335 | found = 1; |
2336 | } |
2337 | |
2338 | if (reg->hr_item_pinned) { |
2339 | mlog(ML_CLUSTER, "Unpin region %s\n" , uuid); |
2340 | o2nm_undepend_item(item: ®->hr_item); |
2341 | reg->hr_item_pinned = 0; |
2342 | } |
2343 | if (found) |
2344 | break; |
2345 | } |
2346 | } |
2347 | |
2348 | static int o2hb_region_inc_user(const char *region_uuid) |
2349 | { |
2350 | int ret = 0; |
2351 | |
2352 | spin_lock(lock: &o2hb_live_lock); |
2353 | |
2354 | /* local heartbeat */ |
2355 | if (!o2hb_global_heartbeat_active()) { |
2356 | ret = o2hb_region_pin(region_uuid); |
2357 | goto unlock; |
2358 | } |
2359 | |
2360 | /* |
2361 | * if global heartbeat active and this is the first dependent user, |
2362 | * pin all regions if quorum region count <= CUT_OFF |
2363 | */ |
2364 | o2hb_dependent_users++; |
2365 | if (o2hb_dependent_users > 1) |
2366 | goto unlock; |
2367 | |
2368 | if (bitmap_weight(src: o2hb_quorum_region_bitmap, |
2369 | O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) |
2370 | ret = o2hb_region_pin(NULL); |
2371 | |
2372 | unlock: |
2373 | spin_unlock(lock: &o2hb_live_lock); |
2374 | return ret; |
2375 | } |
2376 | |
2377 | static void o2hb_region_dec_user(const char *region_uuid) |
2378 | { |
2379 | spin_lock(lock: &o2hb_live_lock); |
2380 | |
2381 | /* local heartbeat */ |
2382 | if (!o2hb_global_heartbeat_active()) { |
2383 | o2hb_region_unpin(region_uuid); |
2384 | goto unlock; |
2385 | } |
2386 | |
2387 | /* |
2388 | * if global heartbeat active and there are no dependent users, |
2389 | * unpin all quorum regions |
2390 | */ |
2391 | o2hb_dependent_users--; |
2392 | if (!o2hb_dependent_users) |
2393 | o2hb_region_unpin(NULL); |
2394 | |
2395 | unlock: |
2396 | spin_unlock(lock: &o2hb_live_lock); |
2397 | } |
2398 | |
2399 | int o2hb_register_callback(const char *region_uuid, |
2400 | struct o2hb_callback_func *hc) |
2401 | { |
2402 | struct o2hb_callback_func *f; |
2403 | struct o2hb_callback *hbcall; |
2404 | int ret; |
2405 | |
2406 | BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); |
2407 | BUG_ON(!list_empty(&hc->hc_item)); |
2408 | |
2409 | hbcall = hbcall_from_type(type: hc->hc_type); |
2410 | if (IS_ERR(ptr: hbcall)) { |
2411 | ret = PTR_ERR(ptr: hbcall); |
2412 | goto out; |
2413 | } |
2414 | |
2415 | if (region_uuid) { |
2416 | ret = o2hb_region_inc_user(region_uuid); |
2417 | if (ret) { |
2418 | mlog_errno(ret); |
2419 | goto out; |
2420 | } |
2421 | } |
2422 | |
2423 | down_write(sem: &o2hb_callback_sem); |
2424 | |
2425 | list_for_each_entry(f, &hbcall->list, hc_item) { |
2426 | if (hc->hc_priority < f->hc_priority) { |
2427 | list_add_tail(new: &hc->hc_item, head: &f->hc_item); |
2428 | break; |
2429 | } |
2430 | } |
2431 | if (list_empty(head: &hc->hc_item)) |
2432 | list_add_tail(new: &hc->hc_item, head: &hbcall->list); |
2433 | |
2434 | up_write(sem: &o2hb_callback_sem); |
2435 | ret = 0; |
2436 | out: |
2437 | mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n" , |
2438 | ret, __builtin_return_address(0), hc); |
2439 | return ret; |
2440 | } |
2441 | EXPORT_SYMBOL_GPL(o2hb_register_callback); |
2442 | |
2443 | void o2hb_unregister_callback(const char *region_uuid, |
2444 | struct o2hb_callback_func *hc) |
2445 | { |
2446 | BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); |
2447 | |
2448 | mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n" , |
2449 | __builtin_return_address(0), hc); |
2450 | |
2451 | /* XXX Can this happen _with_ a region reference? */ |
2452 | if (list_empty(head: &hc->hc_item)) |
2453 | return; |
2454 | |
2455 | if (region_uuid) |
2456 | o2hb_region_dec_user(region_uuid); |
2457 | |
2458 | down_write(sem: &o2hb_callback_sem); |
2459 | |
2460 | list_del_init(entry: &hc->hc_item); |
2461 | |
2462 | up_write(sem: &o2hb_callback_sem); |
2463 | } |
2464 | EXPORT_SYMBOL_GPL(o2hb_unregister_callback); |
2465 | |
2466 | int o2hb_check_node_heartbeating_no_sem(u8 node_num) |
2467 | { |
2468 | unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
2469 | |
2470 | spin_lock(lock: &o2hb_live_lock); |
2471 | o2hb_fill_node_map_from_callback(map: testing_map, O2NM_MAX_NODES); |
2472 | spin_unlock(lock: &o2hb_live_lock); |
2473 | if (!test_bit(node_num, testing_map)) { |
2474 | mlog(ML_HEARTBEAT, |
2475 | "node (%u) does not have heartbeating enabled.\n" , |
2476 | node_num); |
2477 | return 0; |
2478 | } |
2479 | |
2480 | return 1; |
2481 | } |
2482 | EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem); |
2483 | |
2484 | int o2hb_check_node_heartbeating_from_callback(u8 node_num) |
2485 | { |
2486 | unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
2487 | |
2488 | o2hb_fill_node_map_from_callback(map: testing_map, O2NM_MAX_NODES); |
2489 | if (!test_bit(node_num, testing_map)) { |
2490 | mlog(ML_HEARTBEAT, |
2491 | "node (%u) does not have heartbeating enabled.\n" , |
2492 | node_num); |
2493 | return 0; |
2494 | } |
2495 | |
2496 | return 1; |
2497 | } |
2498 | EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); |
2499 | |
2500 | /* |
2501 | * this is just a hack until we get the plumbing which flips file systems |
2502 | * read only and drops the hb ref instead of killing the node dead. |
2503 | */ |
2504 | void o2hb_stop_all_regions(void) |
2505 | { |
2506 | struct o2hb_region *reg; |
2507 | |
2508 | mlog(ML_ERROR, "stopping heartbeat on all active regions.\n" ); |
2509 | |
2510 | spin_lock(lock: &o2hb_live_lock); |
2511 | |
2512 | list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) |
2513 | reg->hr_unclean_stop = 1; |
2514 | |
2515 | spin_unlock(lock: &o2hb_live_lock); |
2516 | } |
2517 | EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); |
2518 | |
2519 | int o2hb_get_all_regions(char *region_uuids, u8 max_regions) |
2520 | { |
2521 | struct o2hb_region *reg; |
2522 | int numregs = 0; |
2523 | char *p; |
2524 | |
2525 | spin_lock(lock: &o2hb_live_lock); |
2526 | |
2527 | p = region_uuids; |
2528 | list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { |
2529 | if (reg->hr_item_dropped) |
2530 | continue; |
2531 | |
2532 | mlog(0, "Region: %s\n" , config_item_name(®->hr_item)); |
2533 | if (numregs < max_regions) { |
2534 | memcpy(p, config_item_name(®->hr_item), |
2535 | O2HB_MAX_REGION_NAME_LEN); |
2536 | p += O2HB_MAX_REGION_NAME_LEN; |
2537 | } |
2538 | numregs++; |
2539 | } |
2540 | |
2541 | spin_unlock(lock: &o2hb_live_lock); |
2542 | |
2543 | return numregs; |
2544 | } |
2545 | EXPORT_SYMBOL_GPL(o2hb_get_all_regions); |
2546 | |
2547 | int o2hb_global_heartbeat_active(void) |
2548 | { |
2549 | return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL); |
2550 | } |
2551 | EXPORT_SYMBOL(o2hb_global_heartbeat_active); |
2552 | |