1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/ceph/ceph_debug.h> |
3 | |
4 | #include <linux/bug.h> |
5 | #include <linux/err.h> |
6 | #include <linux/random.h> |
7 | #include <linux/slab.h> |
8 | #include <linux/types.h> |
9 | |
10 | #include <linux/ceph/mdsmap.h> |
11 | #include <linux/ceph/messenger.h> |
12 | #include <linux/ceph/decode.h> |
13 | |
14 | #include "super.h" |
15 | |
/*
 * True when rank @i of the map `m` (taken from the caller's scope) is up,
 * i.e. has state > 0, and is either not laggy or @ignore_laggy is set.
 *
 * The up-check must gate both cases.  Written as "a && b ? true : c" the
 * expression parses as "(a && b) ? true : c", which reports a rank that is
 * not up as ready as long as it is not laggy.
 */
#define CEPH_MDS_IS_READY(i, ignore_laggy) \
	(m->m_info[i].state > 0 && ((ignore_laggy) || !m->m_info[i].laggy))
18 | |
19 | static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) |
20 | { |
21 | int n = 0; |
22 | int i, j; |
23 | |
24 | /* count */ |
25 | for (i = 0; i < m->possible_max_rank; i++) |
26 | if (CEPH_MDS_IS_READY(i, ignore_laggy)) |
27 | n++; |
28 | if (n == 0) |
29 | return -1; |
30 | |
31 | /* pick */ |
32 | n = get_random_u32_below(ceil: n); |
33 | for (j = 0, i = 0; i < m->possible_max_rank; i++) { |
34 | if (CEPH_MDS_IS_READY(i, ignore_laggy)) |
35 | j++; |
36 | if (j > n) |
37 | break; |
38 | } |
39 | |
40 | return i; |
41 | } |
42 | |
43 | /* |
44 | * choose a random mds that is "up" (i.e. has a state > 0), or -1. |
45 | */ |
46 | int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) |
47 | { |
48 | int mds; |
49 | |
50 | mds = __mdsmap_get_random_mds(m, ignore_laggy: false); |
51 | if (mds == m->possible_max_rank || mds == -1) |
52 | mds = __mdsmap_get_random_mds(m, ignore_laggy: true); |
53 | |
54 | return mds == m->possible_max_rank ? -1 : mds; |
55 | } |
56 | |
/*
 * Skip one fixed-size field of @type at the cursor *@p.  Jumps to the label
 * @bad, leaving the cursor untouched, when fewer than sizeof(@type) bytes
 * remain before @end.
 *
 * @p and @end are parenthesized so that expression arguments expand safely.
 */
#define __decode_and_drop_type(p, end, type, bad)		\
	do {							\
		if (*(p) + sizeof(type) > (end))		\
			goto bad;				\
		*(p) += sizeof(type);				\
	} while (0)
63 | |
/*
 * Skip an encoded set: a u32 element count followed by that many
 * fixed-size elements of @type.  Jumps to @bad if the count or the
 * elements do not fit before @end.
 *
 * The count comes off the wire, so the room check divides the remaining
 * bytes instead of multiplying the count: "sizeof(type) * count" could wrap
 * a 32-bit size_t and make an oversized set look like it fits.  After
 * ceph_decode_32_safe() succeeds the cursor is at or before @end, so the
 * pointer difference is non-negative.
 */
#define __decode_and_drop_set(p, end, type, bad)			\
	do {								\
		u32 __cnt;						\
		ceph_decode_32_safe(p, end, __cnt, bad);		\
		if (__cnt > (size_t)((end) - *(p)) / sizeof(type))	\
			goto bad;					\
		*(p) += sizeof(type) * __cnt;				\
	} while (0)
73 | |
/*
 * Skip an encoded map: a u32 entry count followed by that many
 * (@ktype, @vtype) pairs of fixed size.  Jumps to @bad if the count or the
 * entries do not fit before @end.
 *
 * As the count comes off the wire, the room check divides the remaining
 * bytes by the entry size rather than multiplying, so a huge count cannot
 * wrap a 32-bit size_t and slip past the bounds check.  After
 * ceph_decode_32_safe() succeeds the cursor is at or before @end, so the
 * pointer difference is non-negative.
 */
#define __decode_and_drop_map(p, end, ktype, vtype, bad)		\
	do {								\
		u32 __cnt;						\
		const size_t __entry = sizeof(ktype) + sizeof(vtype);	\
		ceph_decode_32_safe(p, end, __cnt, bad);		\
		if (__cnt > (size_t)((end) - *(p)) / __entry)		\
			goto bad;					\
		*(p) += __entry * __cnt;				\
	} while (0)
83 | |
84 | |
85 | static int __decode_and_drop_compat_set(void **p, void* end) |
86 | { |
87 | int i; |
88 | /* compat, ro_compat, incompat*/ |
89 | for (i = 0; i < 3; i++) { |
90 | u32 n; |
91 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); |
92 | /* mask */ |
93 | *p += sizeof(u64); |
94 | /* names (map<u64, string>) */ |
95 | n = ceph_decode_32(p); |
96 | while (n-- > 0) { |
97 | u32 len; |
98 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), |
99 | bad); |
100 | *p += sizeof(u64); |
101 | len = ceph_decode_32(p); |
102 | ceph_decode_need(p, end, len, bad); |
103 | *p += len; |
104 | } |
105 | } |
106 | return 0; |
107 | bad: |
108 | return -1; |
109 | } |
110 | |
111 | /* |
112 | * Decode an MDS map |
113 | * |
114 | * Ignore any fields we don't care about (there are quite a few of |
115 | * them). |
116 | */ |
117 | struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) |
118 | { |
119 | struct ceph_mdsmap *m; |
120 | const void *start = *p; |
121 | int i, j, n; |
122 | int err; |
123 | u8 mdsmap_v; |
124 | u16 mdsmap_ev; |
125 | u32 target; |
126 | |
127 | m = kzalloc(size: sizeof(*m), GFP_NOFS); |
128 | if (!m) |
129 | return ERR_PTR(error: -ENOMEM); |
130 | |
131 | ceph_decode_need(p, end, 1 + 1, bad); |
132 | mdsmap_v = ceph_decode_8(p); |
133 | *p += sizeof(u8); /* mdsmap_cv */ |
134 | if (mdsmap_v >= 4) { |
135 | u32 mdsmap_len; |
136 | ceph_decode_32_safe(p, end, mdsmap_len, bad); |
137 | if (end < *p + mdsmap_len) |
138 | goto bad; |
139 | end = *p + mdsmap_len; |
140 | } |
141 | |
142 | ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); |
143 | m->m_epoch = ceph_decode_32(p); |
144 | m->m_client_epoch = ceph_decode_32(p); |
145 | m->m_last_failure = ceph_decode_32(p); |
146 | m->m_root = ceph_decode_32(p); |
147 | m->m_session_timeout = ceph_decode_32(p); |
148 | m->m_session_autoclose = ceph_decode_32(p); |
149 | m->m_max_file_size = ceph_decode_64(p); |
150 | m->m_max_mds = ceph_decode_32(p); |
151 | |
152 | /* |
153 | * pick out the active nodes as the m_num_active_mds, the |
154 | * m_num_active_mds maybe larger than m_max_mds when decreasing |
155 | * the max_mds in cluster side, in other case it should less |
156 | * than or equal to m_max_mds. |
157 | */ |
158 | m->m_num_active_mds = n = ceph_decode_32(p); |
159 | |
160 | /* |
161 | * the possible max rank, it maybe larger than the m_num_active_mds, |
162 | * for example if the mds_max == 2 in the cluster, when the MDS(0) |
163 | * was laggy and being replaced by a new MDS, we will temporarily |
164 | * receive a new mds map with n_num_mds == 1 and the active MDS(1), |
165 | * and the mds rank >= m_num_active_mds. |
166 | */ |
167 | m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds); |
168 | |
169 | m->m_info = kcalloc(n: m->possible_max_rank, size: sizeof(*m->m_info), GFP_NOFS); |
170 | if (!m->m_info) |
171 | goto nomem; |
172 | |
173 | /* pick out active nodes from mds_info (state > 0) */ |
174 | for (i = 0; i < n; i++) { |
175 | u64 global_id; |
176 | u32 namelen; |
177 | s32 mds, inc, state; |
178 | u8 info_v; |
179 | void *info_end = NULL; |
180 | struct ceph_entity_addr addr; |
181 | u32 num_export_targets; |
182 | void *pexport_targets = NULL; |
183 | struct ceph_timespec laggy_since; |
184 | struct ceph_mds_info *info; |
185 | bool laggy; |
186 | |
187 | ceph_decode_need(p, end, sizeof(u64) + 1, bad); |
188 | global_id = ceph_decode_64(p); |
189 | info_v= ceph_decode_8(p); |
190 | if (info_v >= 4) { |
191 | u32 info_len; |
192 | ceph_decode_need(p, end, 1 + sizeof(u32), bad); |
193 | *p += sizeof(u8); /* info_cv */ |
194 | info_len = ceph_decode_32(p); |
195 | info_end = *p + info_len; |
196 | if (info_end > end) |
197 | goto bad; |
198 | } |
199 | |
200 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); |
201 | *p += sizeof(u64); |
202 | namelen = ceph_decode_32(p); /* skip mds name */ |
203 | *p += namelen; |
204 | |
205 | ceph_decode_32_safe(p, end, mds, bad); |
206 | ceph_decode_32_safe(p, end, inc, bad); |
207 | ceph_decode_32_safe(p, end, state, bad); |
208 | *p += sizeof(u64); /* state_seq */ |
209 | if (info_v >= 8) |
210 | err = ceph_decode_entity_addrvec(p, end, msgr2, addr: &addr); |
211 | else |
212 | err = ceph_decode_entity_addr(p, end, addr: &addr); |
213 | if (err) |
214 | goto corrupt; |
215 | |
216 | ceph_decode_copy_safe(p, end, &laggy_since, sizeof(laggy_since), |
217 | bad); |
218 | laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; |
219 | *p += sizeof(u32); |
220 | ceph_decode_32_safe(p, end, namelen, bad); |
221 | *p += namelen; |
222 | if (info_v >= 2) { |
223 | ceph_decode_32_safe(p, end, num_export_targets, bad); |
224 | pexport_targets = *p; |
225 | *p += num_export_targets * sizeof(u32); |
226 | } else { |
227 | num_export_targets = 0; |
228 | } |
229 | |
230 | if (info_end && *p != info_end) { |
231 | if (*p > info_end) |
232 | goto bad; |
233 | *p = info_end; |
234 | } |
235 | |
236 | dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n" , |
237 | i+1, n, global_id, mds, inc, |
238 | ceph_pr_addr(&addr), |
239 | ceph_mds_state_name(state), |
240 | laggy ? "(laggy)" : "" ); |
241 | |
242 | if (mds < 0 || mds >= m->possible_max_rank) { |
243 | pr_warn("mdsmap_decode got incorrect mds(%d)\n" , mds); |
244 | continue; |
245 | } |
246 | |
247 | if (state <= 0) { |
248 | dout("mdsmap_decode got incorrect state(%s)\n" , |
249 | ceph_mds_state_name(state)); |
250 | continue; |
251 | } |
252 | |
253 | info = &m->m_info[mds]; |
254 | info->global_id = global_id; |
255 | info->state = state; |
256 | info->addr = addr; |
257 | info->laggy = laggy; |
258 | info->num_export_targets = num_export_targets; |
259 | if (num_export_targets) { |
260 | info->export_targets = kcalloc(n: num_export_targets, |
261 | size: sizeof(u32), GFP_NOFS); |
262 | if (!info->export_targets) |
263 | goto nomem; |
264 | for (j = 0; j < num_export_targets; j++) { |
265 | target = ceph_decode_32(p: &pexport_targets); |
266 | info->export_targets[j] = target; |
267 | } |
268 | } else { |
269 | info->export_targets = NULL; |
270 | } |
271 | } |
272 | |
273 | /* pg_pools */ |
274 | ceph_decode_32_safe(p, end, n, bad); |
275 | m->m_num_data_pg_pools = n; |
276 | m->m_data_pg_pools = kcalloc(n, size: sizeof(u64), GFP_NOFS); |
277 | if (!m->m_data_pg_pools) |
278 | goto nomem; |
279 | ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); |
280 | for (i = 0; i < n; i++) |
281 | m->m_data_pg_pools[i] = ceph_decode_64(p); |
282 | m->m_cas_pg_pool = ceph_decode_64(p); |
283 | m->m_enabled = m->m_epoch > 1; |
284 | |
285 | mdsmap_ev = 1; |
286 | if (mdsmap_v >= 2) { |
287 | ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext); |
288 | } |
289 | if (mdsmap_ev >= 3) { |
290 | if (__decode_and_drop_compat_set(p, end) < 0) |
291 | goto bad_ext; |
292 | } |
293 | /* metadata_pool */ |
294 | if (mdsmap_ev < 5) { |
295 | __decode_and_drop_type(p, end, u32, bad_ext); |
296 | } else { |
297 | __decode_and_drop_type(p, end, u64, bad_ext); |
298 | } |
299 | |
300 | /* created + modified + tableserver */ |
301 | __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); |
302 | __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); |
303 | __decode_and_drop_type(p, end, u32, bad_ext); |
304 | |
305 | /* in */ |
306 | { |
307 | int num_laggy = 0; |
308 | ceph_decode_32_safe(p, end, n, bad_ext); |
309 | ceph_decode_need(p, end, sizeof(u32) * n, bad_ext); |
310 | |
311 | for (i = 0; i < n; i++) { |
312 | s32 mds = ceph_decode_32(p); |
313 | if (mds >= 0 && mds < m->possible_max_rank) { |
314 | if (m->m_info[mds].laggy) |
315 | num_laggy++; |
316 | } |
317 | } |
318 | m->m_num_laggy = num_laggy; |
319 | |
320 | if (n > m->possible_max_rank) { |
321 | void *new_m_info = krealloc(objp: m->m_info, |
322 | new_size: n * sizeof(*m->m_info), |
323 | GFP_NOFS | __GFP_ZERO); |
324 | if (!new_m_info) |
325 | goto nomem; |
326 | m->m_info = new_m_info; |
327 | } |
328 | m->possible_max_rank = n; |
329 | } |
330 | |
331 | /* inc */ |
332 | __decode_and_drop_map(p, end, u32, u32, bad_ext); |
333 | /* up */ |
334 | __decode_and_drop_map(p, end, u32, u64, bad_ext); |
335 | /* failed */ |
336 | __decode_and_drop_set(p, end, u32, bad_ext); |
337 | /* stopped */ |
338 | __decode_and_drop_set(p, end, u32, bad_ext); |
339 | |
340 | if (mdsmap_ev >= 4) { |
341 | /* last_failure_osd_epoch */ |
342 | __decode_and_drop_type(p, end, u32, bad_ext); |
343 | } |
344 | if (mdsmap_ev >= 6) { |
345 | /* ever_allowed_snaps */ |
346 | __decode_and_drop_type(p, end, u8, bad_ext); |
347 | /* explicitly_allowed_snaps */ |
348 | __decode_and_drop_type(p, end, u8, bad_ext); |
349 | } |
350 | if (mdsmap_ev >= 7) { |
351 | /* inline_data_enabled */ |
352 | __decode_and_drop_type(p, end, u8, bad_ext); |
353 | } |
354 | if (mdsmap_ev >= 8) { |
355 | /* enabled */ |
356 | ceph_decode_8_safe(p, end, m->m_enabled, bad_ext); |
357 | /* fs_name */ |
358 | ceph_decode_skip_string(p, end, bad_ext); |
359 | } |
360 | /* damaged */ |
361 | if (mdsmap_ev >= 9) { |
362 | size_t need; |
363 | ceph_decode_32_safe(p, end, n, bad_ext); |
364 | need = sizeof(u32) * n; |
365 | ceph_decode_need(p, end, need, bad_ext); |
366 | *p += need; |
367 | m->m_damaged = n > 0; |
368 | } else { |
369 | m->m_damaged = false; |
370 | } |
371 | if (mdsmap_ev >= 17) { |
372 | /* balancer */ |
373 | ceph_decode_skip_string(p, end, bad_ext); |
374 | /* standby_count_wanted */ |
375 | ceph_decode_skip_32(p, end, bad_ext); |
376 | /* old_max_mds */ |
377 | ceph_decode_skip_32(p, end, bad_ext); |
378 | /* min_compat_client */ |
379 | ceph_decode_skip_8(p, end, bad_ext); |
380 | /* required_client_features */ |
381 | ceph_decode_skip_set(p, end, 64, bad_ext); |
382 | ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext); |
383 | } else { |
384 | /* This forces the usage of the (sync) SETXATTR Op */ |
385 | m->m_max_xattr_size = 0; |
386 | } |
387 | bad_ext: |
388 | dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n" , |
389 | !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); |
390 | *p = end; |
391 | dout("mdsmap_decode success epoch %u\n" , m->m_epoch); |
392 | return m; |
393 | nomem: |
394 | err = -ENOMEM; |
395 | goto out_err; |
396 | corrupt: |
397 | pr_err("corrupt mdsmap\n" ); |
398 | print_hex_dump(KERN_DEBUG, prefix_str: "mdsmap: " , |
399 | prefix_type: DUMP_PREFIX_OFFSET, rowsize: 16, groupsize: 1, |
400 | buf: start, len: end - start, ascii: true); |
401 | out_err: |
402 | ceph_mdsmap_destroy(m); |
403 | return ERR_PTR(error: err); |
404 | bad: |
405 | err = -EINVAL; |
406 | goto corrupt; |
407 | } |
408 | |
409 | void ceph_mdsmap_destroy(struct ceph_mdsmap *m) |
410 | { |
411 | int i; |
412 | |
413 | if (m->m_info) { |
414 | for (i = 0; i < m->possible_max_rank; i++) |
415 | kfree(objp: m->m_info[i].export_targets); |
416 | kfree(objp: m->m_info); |
417 | } |
418 | kfree(objp: m->m_data_pg_pools); |
419 | kfree(objp: m); |
420 | } |
421 | |
422 | bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) |
423 | { |
424 | int i, nr_active = 0; |
425 | if (!m->m_enabled) |
426 | return false; |
427 | if (m->m_damaged) |
428 | return false; |
429 | if (m->m_num_laggy == m->m_num_active_mds) |
430 | return false; |
431 | for (i = 0; i < m->possible_max_rank; i++) { |
432 | if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) |
433 | nr_active++; |
434 | } |
435 | return nr_active > 0; |
436 | } |
437 | |