mapper.c source code [linux/net/ceph/crush/mapper.c]

1	/*
2	* Ceph - scalable distributed file system
3	*
4	* Copyright (C) 2015 Intel Corporation All Rights Reserved
5	*
6	* This is free software; you can redistribute it and/or
7	* modify it under the terms of the GNU Lesser General Public
8	* License version 2.1, as published by the Free Software
9	* Foundation. See file COPYING.
10	*
11	*/
12
13	#ifdef __KERNEL__
14	# include <linux/string.h>
15	# include <linux/slab.h>
16	# include <linux/bug.h>
17	# include <linux/kernel.h>
18	# include <linux/crush/crush.h>
19	# include <linux/crush/hash.h>
20	# include <linux/crush/mapper.h>
21	#else
22	# include "crush_compat.h"
23	# include "crush.h"
24	# include "hash.h"
25	# include "mapper.h"
26	#endif
27	#include "crush_ln_table.h"
28
29	#define dprintk(args...) /* printf(args) */
30
31	/*
32	* Implement the core CRUSH mapping algorithm.
33	*/
34
35	/**
36	* crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
37	* @map: the crush_map
38	* @ruleset: the storage ruleset id (user defined)
39	* @type: storage ruleset type (user defined)
40	* @size: output set size
41	*/
42	int crush_find_rule(const struct crush_map map, int* ruleset, int type, int size)
43	{
44	__u32 i;
45
46	for (i = `0`; i < map->max_rules; i++) {
47	if (map->rules[i] &&
48	map->rules[i]->mask.ruleset == ruleset &&
49	map->rules[i]->mask.type == type &&
50	map->rules[i]->mask.min_size <= size &&
51	map->rules[i]->mask.max_size >= size)
52	return i;
53	}
54	return -`1`;
55	}
56
57	/*
58	* bucket choose methods
59	*
60	* For each bucket algorithm, we have a "choose" method that, given a
61	* crush input @x and replica position (usually, position in output set) @r,
62	* will produce an item in the bucket.
63	*/
64
65	/*
66	* Choose based on a random permutation of the bucket.
67	*
68	* We used to use some prime number arithmetic to do this, but it
69	* wasn't very random, and had some other bad behaviors. Instead, we
70	* calculate an actual random permutation of the bucket members.
71	* Since this is expensive, we optimize for the r=0 case, which
72	* captures the vast majority of calls.
73	*/
74	static int bucket_perm_choose(const struct crush_bucket *bucket,
75	struct crush_work_bucket *work,
76	int x, int r)
77	{
78	unsigned int pr = r % bucket->size;
79	unsigned int i, s;
80
81	/ start a new permutation if @x has changed /
82	if (work->perm_x != (__u32)x \|\| work->perm_n == `0`) {
83	dprintk("bucket %d new x=%d\n", bucket->id, x);
84	work->perm_x = x;
85
86	/ optimize common r=0 case /
87	if (pr == `0`) {
88	s = crush_hash32_3(type: bucket->hash, a: x, b: bucket->id, c: `0`) %
89	bucket->size;
90	work->perm[`0`] = s;
91	work->perm_n = `0xffff`; / magic value, see below /
92	goto out;
93	}
94
95	for (i = `0`; i < bucket->size; i++)
96	work->perm[i] = i;
97	work->perm_n = `0`;
98	} else if (work->perm_n == `0xffff`) {
99	/ clean up after the r=0 case above /
100	for (i = `1`; i < bucket->size; i++)
101	work->perm[i] = i;
102	work->perm[work->perm[`0`]] = `0`;
103	work->perm_n = `1`;
104	}
105
106	/ calculate permutation up to pr /
107	for (i = `0`; i < work->perm_n; i++)
108	dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
109	while (work->perm_n <= pr) {
110	unsigned int p = work->perm_n;
111	/ no point in swapping the final entry /
112	if (p < bucket->size - `1`) {
113	i = crush_hash32_3(type: bucket->hash, a: x, b: bucket->id, c: p) %
114	(bucket->size - p);
115	if (i) {
116	unsigned int t = work->perm[p + i];
117	work->perm[p + i] = work->perm[p];
118	work->perm[p] = t;
119	}
120	dprintk(" perm_choose swap %d with %d\n", p, p+i);
121	}
122	work->perm_n++;
123	}
124	for (i = `0`; i < bucket->size; i++)
125	dprintk(" perm_choose %d: %d\n", i, work->perm[i]);
126
127	s = work->perm[pr];
128	out:
129	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
130	bucket->size, x, r, pr, s);
131	return bucket->items[s];
132	}
133
134	/ uniform /
135	static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
136	struct crush_work_bucket work, int* x, int r)
137	{
138	return bucket_perm_choose(bucket: &bucket->h, work, x, r);
139	}
140
141	/ list /
142	static int bucket_list_choose(const struct crush_bucket_list *bucket,
143	int x, int r)
144	{
145	int i;
146
147	for (i = bucket->h.size-`1`; i >= `0`; i--) {
148	__u64 w = crush_hash32_4(type: bucket->h.hash, a: x, b: bucket->h.items[i],
149	c: r, d: bucket->h.id);
150	w &= `0xffff`;
151	dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
152	"sw %x rand %llx",
153	i, x, r, bucket->h.items[i], bucket->item_weights[i],
154	bucket->sum_weights[i], w);
155	w *= bucket->sum_weights[i];
156	w = w >> `16`;
157	/dprintk(" scaled %llx\n", w);/
158	if (w < bucket->item_weights[i]) {
159	return bucket->h.items[i];
160	}
161	}
162
163	dprintk("bad list sums for bucket %d\n", bucket->h.id);
164	return bucket->h.items[`0`];
165	}
166
167
/* (binary) tree */

/*
 * Height of tree node @n, i.e. the number of trailing zero bits in @n.
 * Odd node numbers have height 0.  @n must be non-zero: for n == 0 the
 * loop condition never becomes false.
 */
static int height(int n)
{
	int h = 0;
	while ((n & 1) == 0) {
		h++;
		n = n >> 1;
	}
	return h;
}
178
/*
 * Node number of the left child of tree node @x.  @x must have
 * height >= 1 (be even and non-zero); otherwise the shift count
 * h-1 would be negative.
 */
static int left(int x)
{
	int h = height(x);
	return x - (1 << (h-1));
}
184
/*
 * Node number of the right child of tree node @x.  @x must have
 * height >= 1 (be even and non-zero); otherwise the shift count
 * h-1 would be negative.
 */
static int right(int x)
{
	int h = height(x);
	return x + (1 << (h-1));
}
190
/* Non-zero when tree node @x is odd-numbered (a node of height 0). */
static int terminal(int x)
{
	return x & 1;
}
195
196	static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
197	int x, int r)
198	{
199	int n;
200	__u32 w;
201	__u64 t;
202
203	/ start at root /
204	n = bucket->num_nodes >> `1`;
205
206	while (!terminal(x: n)) {
207	int l;
208	/ pick point in [0, w) /
209	w = bucket->node_weights[n];
210	t = (__u64)crush_hash32_4(type: bucket->h.hash, a: x, b: n, c: r,
211	d: bucket->h.id) * (__u64)w;
212	t = t >> `32`;
213
214	/ descend to the left or right? /
215	l = left(x: n);
216	if (t < bucket->node_weights[l])
217	n = l;
218	else
219	n = right(x: n);
220	}
221
222	return bucket->h.items[n >> `1`];
223	}
224
225
226	/ straw /
227
228	static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
229	int x, int r)
230	{
231	__u32 i;
232	int high = `0`;
233	__u64 high_draw = `0`;
234	__u64 draw;
235
236	for (i = `0`; i < bucket->h.size; i++) {
237	draw = crush_hash32_3(type: bucket->h.hash, a: x, b: bucket->h.items[i], c: r);
238	draw &= `0xffff`;
239	draw *= bucket->straws[i];
240	if (i == `0` \|\| draw > high_draw) {
241	high = i;
242	high_draw = draw;
243	}
244	}
245	return bucket->h.items[high];
246	}
247
248	/ compute 2^44log2(input+1) /*
249	static __u64 crush_ln(unsigned int xin)
250	{
251	unsigned int x = xin;
252	int iexpon, index1, index2;
253	__u64 RH, LH, LL, xl64, result;
254
255	x++;
256
257	/ normalize input /
258	iexpon = `15`;
259
260	/*
261	* figure out number of bits we need to shift and
262	* do it in one step instead of iteratively
263	*/
264	if (!(x & `0x18000`)) {
265	int bits = __builtin_clz(x & `0x1FFFF`) - `16`;
266	x <<= bits;
267	iexpon = `15` - bits;
268	}
269
270	index1 = (x >> `8`) << `1`;
271	/ RH ~ 2^56/index1 /
272	RH = __RH_LH_tbl[index1 - `256`];
273	/ LH ~ 2^48 * log2(index1/256) /
274	LH = __RH_LH_tbl[index1 + `1` - `256`];
275
276	/ RHx ~ 2^48 (2^15 + xf), xf<2^8 /
277	xl64 = (__s64)x * RH;
278	xl64 >>= `48`;
279
280	result = iexpon;
281	result <<= (`12` + `32`);
282
283	index2 = xl64 & `0xff`;
284	/ LL ~ 2^48log2(1.0+index2/2^15) /*
285	LL = __LL_tbl[index2];
286
287	LH = LH + LL;
288
289	LH >>= (`48` - `12` - `32`);
290	result += LH;
291
292	return result;
293	}
294
295
296	/*
297	* straw2
298	*
299	* for reference, see:
300	*
301	* https://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
302	*
303	*/
304
305	static __u32 get_choose_arg_weights(const* struct crush_bucket_straw2 *bucket,
306	const struct crush_choose_arg *arg,
307	int position)
308	{
309	if (!arg \|\| !arg->weight_set)
310	return bucket->item_weights;
311
312	if (position >= arg->weight_set_size)
313	position = arg->weight_set_size - `1`;
314	return arg->weight_set[position].weights;
315	}
316
317	static __s32 get_choose_arg_ids(const* struct crush_bucket_straw2 *bucket,
318	const struct crush_choose_arg *arg)
319	{
320	if (!arg \|\| !arg->ids)
321	return bucket->h.items;
322
323	return arg->ids;
324	}
325
326	static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
327	int x, int r,
328	const struct crush_choose_arg *arg,
329	int position)
330	{
331	unsigned int i, high = `0`;
332	unsigned int u;
333	__s64 ln, draw, high_draw = `0`;
334	__u32 *weights = get_choose_arg_weights(bucket, arg, position);
335	__s32 *ids = get_choose_arg_ids(bucket, arg);
336
337	for (i = `0`; i < bucket->h.size; i++) {
338	dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
339	if (weights[i]) {
340	u = crush_hash32_3(type: bucket->h.hash, a: x, b: ids[i], c: r);
341	u &= `0xffff`;
342
343	/*
344	* for some reason slightly less than 0x10000 produces
345	* a slightly more accurate distribution... probably a
346	* rounding effect.
347	*
348	* the natural log lookup table maps [0,0xffff]
349	* (corresponding to real numbers [1/0x10000, 1] to
350	* [0, 0xffffffffffff] (corresponding to real numbers
351	* [-11.090355,0]).
352	*/
353	ln = crush_ln(xin: u) - `0x1000000000000ll`;
354
355	/*
356	* divide by 16.16 fixed-point weight. note
357	* that the ln value is negative, so a larger
358	* weight means a larger (less negative) value
359	* for draw.
360	*/
361	draw = div64_s64(dividend: ln, divisor: weights[i]);
362	} else {
363	draw = S64_MIN;
364	}
365
366	if (i == `0` \|\| draw > high_draw) {
367	high = i;
368	high_draw = draw;
369	}
370	}
371
372	return bucket->h.items[high];
373	}
374
375
376	static int crush_bucket_choose(const struct crush_bucket *in,
377	struct crush_work_bucket *work,
378	int x, int r,
379	const struct crush_choose_arg *arg,
380	int position)
381	{
382	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
383	BUG_ON(in->size == `0`);
384	switch (in->alg) {
385	case CRUSH_BUCKET_UNIFORM:
386	return bucket_uniform_choose(
387	bucket: (const struct crush_bucket_uniform *)in,
388	work, x, r);
389	case CRUSH_BUCKET_LIST:
390	return bucket_list_choose(bucket: (const struct crush_bucket_list *)in,
391	x, r);
392	case CRUSH_BUCKET_TREE:
393	return bucket_tree_choose(bucket: (const struct crush_bucket_tree *)in,
394	x, r);
395	case CRUSH_BUCKET_STRAW:
396	return bucket_straw_choose(
397	bucket: (const struct crush_bucket_straw *)in,
398	x, r);
399	case CRUSH_BUCKET_STRAW2:
400	return bucket_straw2_choose(
401	bucket: (const struct crush_bucket_straw2 *)in,
402	x, r, arg, position);
403	default:
404	dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
405	return in->items[`0`];
406	}
407	}
408
409	/*
410	* true if device is marked "out" (failed, fully offloaded)
411	* of the cluster
412	*/
413	static int is_out(const struct crush_map *map,
414	const __u32 weight, int* weight_max,
415	int item, int x)
416	{
417	if (item >= weight_max)
418	return `1`;
419	if (weight[item] >= `0x10000`)
420	return `0`;
421	if (weight[item] == `0`)
422	return `1`;
423	if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, a: x, b: item) & `0xffff`)
424	< weight[item])
425	return `0`;
426	return `1`;
427	}
428
429	/**
430	* crush_choose_firstn - choose numrep distinct items of given type
431	* @map: the crush_map
432	* @bucket: the bucket we are choose an item from
433	* @x: crush input value
434	* @numrep: the number of items to choose
435	* @type: the type of item to choose
436	* @out: pointer to output vector
437	* @outpos: our position in that vector
438	* @out_size: size of the out vector
439	* @tries: number of attempts to make
440	* @recurse_tries: number of attempts to have recursive chooseleaf make
441	* @local_retries: localized retries
442	* @local_fallback_retries: localized fallback retries
443	* @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
444	* @stable: stable mode starts rep=0 in the recursive call for all replicas
445	* @vary_r: pass r to recursive calls
446	* @out2: second output vector for leaf items (if @recurse_to_leaf)
447	* @parent_r: r value passed from the parent
448	*/
449	static int crush_choose_firstn(const struct crush_map *map,
450	struct crush_work *work,
451	const struct crush_bucket *bucket,
452	const __u32 weight, int* weight_max,
453	int x, int numrep, int type,
454	int out, int* outpos,
455	int out_size,
456	unsigned int tries,
457	unsigned int recurse_tries,
458	unsigned int local_retries,
459	unsigned int local_fallback_retries,
460	int recurse_to_leaf,
461	unsigned int vary_r,
462	unsigned int stable,
463	int *out2,
464	int parent_r,
465	const struct crush_choose_arg *choose_args)
466	{
467	int rep;
468	unsigned int ftotal, flocal;
469	int retry_descent, retry_bucket, skip_rep;
470	const struct crush_bucket *in = bucket;
471	int r;
472	int i;
473	int item = `0`;
474	int itemtype;
475	int collide, reject;
476	int count = out_size;
477
478	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
479	recurse_to_leaf ? "_LEAF" : "",
480	bucket->id, x, outpos, numrep,
481	tries, recurse_tries, local_retries, local_fallback_retries,
482	parent_r, stable);
483
484	for (rep = stable ? `0` : outpos; rep < numrep && count > `0` ; rep++) {
485	/ keep trying until we get a non-out, non-colliding item /
486	ftotal = `0`;
487	skip_rep = `0`;
488	do {
489	retry_descent = `0`;
490	in = bucket; / initial bucket /
491
492	/ choose through intervening buckets /
493	flocal = `0`;
494	do {
495	collide = `0`;
496	retry_bucket = `0`;
497	r = rep + parent_r;
498	/ r' = r + f_total /
499	r += ftotal;
500
501	/ bucket choose /
502	if (in->size == `0`) {
503	reject = `1`;
504	goto reject;
505	}
506	if (local_fallback_retries > `0` &&
507	flocal >= (in->size>>`1`) &&
508	flocal > local_fallback_retries)
509	item = bucket_perm_choose(
510	bucket: in, work: work->work[-`1`-in->id],
511	x, r);
512	else
513	item = crush_bucket_choose(
514	in, work: work->work[-`1`-in->id],
515	x, r,
516	arg: (choose_args ?
517	&choose_args[-`1`-in->id] : NULL),
518	position: outpos);
519	if (item >= map->max_devices) {
520	dprintk(" bad item %d\n", item);
521	skip_rep = `1`;
522	break;
523	}
524
525	/ desired type? /
526	if (item < `0`)
527	itemtype = map->buckets[-`1`-item]->type;
528	else
529	itemtype = `0`;
530	dprintk(" item %d type %d\n", item, itemtype);
531
532	/ keep going? /
533	if (itemtype != type) {
534	if (item >= `0` \|\|
535	(-`1`-item) >= map->max_buckets) {
536	dprintk(" bad item type %d\n", type);
537	skip_rep = `1`;
538	break;
539	}
540	in = map->buckets[-`1`-item];
541	retry_bucket = `1`;
542	continue;
543	}
544
545	/ collision? /
546	for (i = `0`; i < outpos; i++) {
547	if (out[i] == item) {
548	collide = `1`;
549	break;
550	}
551	}
552
553	reject = `0`;
554	if (!collide && recurse_to_leaf) {
555	if (item < `0`) {
556	int sub_r;
557	if (vary_r)
558	sub_r = r >> (vary_r-`1`);
559	else
560	sub_r = `0`;
561	if (crush_choose_firstn(
562	map,
563	work,
564	bucket: map->buckets[-`1`-item],
565	weight, weight_max,
566	x, numrep: stable ? `1` : outpos+`1`, type: `0`,
567	out: out2, outpos, out_size: count,
568	tries: recurse_tries, recurse_tries: `0`,
569	local_retries,
570	local_fallback_retries,
571	recurse_to_leaf: `0`,
572	vary_r,
573	stable,
574	NULL,
575	parent_r: sub_r,
576	choose_args) <= outpos)
577	/ didn't get leaf /
578	reject = `1`;
579	} else {
580	/ we already have a leaf! /
581	out2[outpos] = item;
582	}
583	}
584
585	if (!reject && !collide) {
586	/ out? /
587	if (itemtype == `0`)
588	reject = is_out(map, weight,
589	weight_max,
590	item, x);
591	}
592
593	reject:
594	if (reject \|\| collide) {
595	ftotal++;
596	flocal++;
597
598	if (collide && flocal <= local_retries)
599	/ retry locally a few times /
600	retry_bucket = `1`;
601	else if (local_fallback_retries > `0` &&
602	flocal <= in->size + local_fallback_retries)
603	/ exhaustive bucket search /
604	retry_bucket = `1`;
605	else if (ftotal < tries)
606	/ then retry descent /
607	retry_descent = `1`;
608	else
609	/ else give up /
610	skip_rep = `1`;
611	dprintk(" reject %d collide %d "
612	"ftotal %u flocal %u\n",
613	reject, collide, ftotal,
614	flocal);
615	}
616	} while (retry_bucket);
617	} while (retry_descent);
618
619	if (skip_rep) {
620	dprintk("skip rep\n");
621	continue;
622	}
623
624	dprintk("CHOOSE got %d\n", item);
625	out[outpos] = item;
626	outpos++;
627	count--;
628	#ifndef __KERNEL__
629	if (map->choose_tries && ftotal <= map->choose_total_tries)
630	map->choose_tries[ftotal]++;
631	#endif
632	}
633
634	dprintk("CHOOSE returns %d\n", outpos);
635	return outpos;
636	}
637
638
639	/**
640	* crush_choose_indep: alternative breadth-first positionally stable mapping
641	*
642	*/
643	static void crush_choose_indep(const struct crush_map *map,
644	struct crush_work *work,
645	const struct crush_bucket *bucket,
646	const __u32 weight, int* weight_max,
647	int x, int left, int numrep, int type,
648	int out, int* outpos,
649	unsigned int tries,
650	unsigned int recurse_tries,
651	int recurse_to_leaf,
652	int *out2,
653	int parent_r,
654	const struct crush_choose_arg *choose_args)
655	{
656	const struct crush_bucket *in = bucket;
657	int endpos = outpos + left;
658	int rep;
659	unsigned int ftotal;
660	int r;
661	int i;
662	int item = `0`;
663	int itemtype;
664	int collide;
665
666	dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
667	bucket->id, x, outpos, numrep);
668
669	/ initially my result is undefined /
670	for (rep = outpos; rep < endpos; rep++) {
671	out[rep] = CRUSH_ITEM_UNDEF;
672	if (out2)
673	out2[rep] = CRUSH_ITEM_UNDEF;
674	}
675
676	for (ftotal = `0`; left > `0` && ftotal < tries; ftotal++) {
677	#ifdef DEBUG_INDEP
678	if (out2 && ftotal) {
679	dprintk("%u %d a: ", ftotal, left);
680	for (rep = outpos; rep < endpos; rep++) {
681	dprintk(" %d", out[rep]);
682	}
683	dprintk("\n");
684	dprintk("%u %d b: ", ftotal, left);
685	for (rep = outpos; rep < endpos; rep++) {
686	dprintk(" %d", out2[rep]);
687	}
688	dprintk("\n");
689	}
690	#endif
691	for (rep = outpos; rep < endpos; rep++) {
692	if (out[rep] != CRUSH_ITEM_UNDEF)
693	continue;
694
695	in = bucket; / initial bucket /
696
697	/ choose through intervening buckets /
698	for (;;) {
699	/ note: we base the choice on the position*
700	* even in the nested call. that means that
701	* if the first layer chooses the same bucket
702	* in a different position, we will tend to
703	* choose a different item in that bucket.
704	* this will involve more devices in data
705	* movement and tend to distribute the load.
706	*/
707	r = rep + parent_r;
708
709	/ be careful /
710	if (in->alg == CRUSH_BUCKET_UNIFORM &&
711	in->size % numrep == `0`)
712	/ r'=r+(n+1)f_total /*
713	r += (numrep+`1`) * ftotal;
714	else
715	/ r' = r + nf_total /*
716	r += numrep * ftotal;
717
718	/ bucket choose /
719	if (in->size == `0`) {
720	dprintk(" empty bucket\n");
721	break;
722	}
723
724	item = crush_bucket_choose(
725	in, work: work->work[-`1`-in->id],
726	x, r,
727	arg: (choose_args ?
728	&choose_args[-`1`-in->id] : NULL),
729	position: outpos);
730	if (item >= map->max_devices) {
731	dprintk(" bad item %d\n", item);
732	out[rep] = CRUSH_ITEM_NONE;
733	if (out2)
734	out2[rep] = CRUSH_ITEM_NONE;
735	left--;
736	break;
737	}
738
739	/ desired type? /
740	if (item < `0`)
741	itemtype = map->buckets[-`1`-item]->type;
742	else
743	itemtype = `0`;
744	dprintk(" item %d type %d\n", item, itemtype);
745
746	/ keep going? /
747	if (itemtype != type) {
748	if (item >= `0` \|\|
749	(-`1`-item) >= map->max_buckets) {
750	dprintk(" bad item type %d\n", type);
751	out[rep] = CRUSH_ITEM_NONE;
752	if (out2)
753	out2[rep] =
754	CRUSH_ITEM_NONE;
755	left--;
756	break;
757	}
758	in = map->buckets[-`1`-item];
759	continue;
760	}
761
762	/ collision? /
763	collide = `0`;
764	for (i = outpos; i < endpos; i++) {
765	if (out[i] == item) {
766	collide = `1`;
767	break;
768	}
769	}
770	if (collide)
771	break;
772
773	if (recurse_to_leaf) {
774	if (item < `0`) {
775	crush_choose_indep(
776	map,
777	work,
778	bucket: map->buckets[-`1`-item],
779	weight, weight_max,
780	x, left: `1`, numrep, type: `0`,
781	out: out2, outpos: rep,
782	tries: recurse_tries, recurse_tries: `0`,
783	recurse_to_leaf: `0`, NULL, parent_r: r,
784	choose_args);
785	if (out2[rep] == CRUSH_ITEM_NONE) {
786	/ placed nothing; no leaf /
787	break;
788	}
789	} else {
790	/ we already have a leaf! /
791	out2[rep] = item;
792	}
793	}
794
795	/ out? /
796	if (itemtype == `0` &&
797	is_out(map, weight, weight_max, item, x))
798	break;
799
800	/ yay! /
801	out[rep] = item;
802	left--;
803	break;
804	}
805	}
806	}
807	for (rep = outpos; rep < endpos; rep++) {
808	if (out[rep] == CRUSH_ITEM_UNDEF) {
809	out[rep] = CRUSH_ITEM_NONE;
810	}
811	if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
812	out2[rep] = CRUSH_ITEM_NONE;
813	}
814	}
815	#ifndef __KERNEL__
816	if (map->choose_tries && ftotal <= map->choose_total_tries)
817	map->choose_tries[ftotal]++;
818	#endif
819	#ifdef DEBUG_INDEP
820	if (out2) {
821	dprintk("%u %d a: ", ftotal, left);
822	for (rep = outpos; rep < endpos; rep++) {
823	dprintk(" %d", out[rep]);
824	}
825	dprintk("\n");
826	dprintk("%u %d b: ", ftotal, left);
827	for (rep = outpos; rep < endpos; rep++) {
828	dprintk(" %d", out2[rep]);
829	}
830	dprintk("\n");
831	}
832	#endif
833	}
834
835
836	/*
837	* This takes a chunk of memory and sets it up to be a shiny new
838	* working area for a CRUSH placement computation. It must be called
839	* on any newly allocated memory before passing it in to
840	* crush_do_rule. It may be used repeatedly after that, so long as the
841	* map has not changed. If the map /has/ changed, you must make sure
842	* the working size is no smaller than what was allocated and re-run
843	* crush_init_workspace.
844	*
845	* If you do retain the working space between calls to crush, make it
846	* thread-local.
847	*/
848	void crush_init_workspace(const struct crush_map map, void* *v)
849	{
850	struct crush_work *w = v;
851	__s32 b;
852
853	/*
854	* We work by moving through the available space and setting
855	* values and pointers as we go.
856	*
857	* It's a bit like Forth's use of the 'allot' word since we
858	* set the pointer first and then reserve the space for it to
859	* point to by incrementing the point.
860	*/
861	v += sizeof(struct crush_work);
862	w->work = v;
863	v += map->max_buckets * sizeof(struct crush_work_bucket *);
864	for (b = `0`; b < map->max_buckets; ++b) {
865	if (!map->buckets[b])
866	continue;
867
868	w->work[b] = v;
869	switch (map->buckets[b]->alg) {
870	default:
871	v += sizeof(struct crush_work_bucket);
872	break;
873	}
874	w->work[b]->perm_x = `0`;
875	w->work[b]->perm_n = `0`;
876	w->work[b]->perm = v;
877	v += map->buckets[b]->size * sizeof(__u32);
878	}
879	BUG_ON(v - (void *)w != map->working_size);
880	}
881
882	/**
883	* crush_do_rule - calculate a mapping with the given input and rule
884	* @map: the crush_map
885	* @ruleno: the rule id
886	* @x: hash input
887	* @result: pointer to result vector
888	* @result_max: maximum result size
889	* @weight: weight vector (for map leaves)
890	* @weight_max: size of weight vector
891	* @cwin: pointer to at least crush_work_size() bytes of memory
892	* @choose_args: weights and ids for each known bucket
893	*/
894	int crush_do_rule(const struct crush_map *map,
895	int ruleno, int x, int result, int* result_max,
896	const __u32 weight, int* weight_max,
897	void cwin, const* struct crush_choose_arg *choose_args)
898	{
899	int result_len;
900	struct crush_work *cw = cwin;
901	int *a = cwin + map->working_size;
902	int *b = a + result_max;
903	int *c = b + result_max;
904	int *w = a;
905	int *o = b;
906	int recurse_to_leaf;
907	int wsize = `0`;
908	int osize;
909	const struct crush_rule *rule;
910	__u32 step;
911	int i, j;
912	int numrep;
913	int out_size;
914	/*
915	* the original choose_total_tries value was off by one (it
916	* counted "retries" and not "tries"). add one.
917	*/
918	int choose_tries = map->choose_total_tries + `1`;
919	int choose_leaf_tries = `0`;
920	/*
921	* the local tries values were counted as "retries", though,
922	* and need no adjustment
923	*/
924	int choose_local_retries = map->choose_local_tries;
925	int choose_local_fallback_retries = map->choose_local_fallback_tries;
926
927	int vary_r = map->chooseleaf_vary_r;
928	int stable = map->chooseleaf_stable;
929
930	if ((__u32)ruleno >= map->max_rules) {
931	dprintk(" bad ruleno %d\n", ruleno);
932	return `0`;
933	}
934
935	rule = map->rules[ruleno];
936	result_len = `0`;
937
938	for (step = `0`; step < rule->len; step++) {
939	int firstn = `0`;
940	const struct crush_rule_step *curstep = &rule->steps[step];
941
942	switch (curstep->op) {
943	case CRUSH_RULE_TAKE:
944	if ((curstep->arg1 >= `0` &&
945	curstep->arg1 < map->max_devices) \|\|
946	(-`1`-curstep->arg1 >= `0` &&
947	-`1`-curstep->arg1 < map->max_buckets &&
948	map->buckets[-`1`-curstep->arg1])) {
949	w[`0`] = curstep->arg1;
950	wsize = `1`;
951	} else {
952	dprintk(" bad take value %d\n", curstep->arg1);
953	}
954	break;
955
956	case CRUSH_RULE_SET_CHOOSE_TRIES:
957	if (curstep->arg1 > `0`)
958	choose_tries = curstep->arg1;
959	break;
960
961	case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
962	if (curstep->arg1 > `0`)
963	choose_leaf_tries = curstep->arg1;
964	break;
965
966	case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
967	if (curstep->arg1 >= `0`)
968	choose_local_retries = curstep->arg1;
969	break;
970
971	case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
972	if (curstep->arg1 >= `0`)
973	choose_local_fallback_retries = curstep->arg1;
974	break;
975
976	case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
977	if (curstep->arg1 >= `0`)
978	vary_r = curstep->arg1;
979	break;
980
981	case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
982	if (curstep->arg1 >= `0`)
983	stable = curstep->arg1;
984	break;
985
986	case CRUSH_RULE_CHOOSELEAF_FIRSTN:
987	case CRUSH_RULE_CHOOSE_FIRSTN:
988	firstn = `1`;
989	fallthrough;
990	case CRUSH_RULE_CHOOSELEAF_INDEP:
991	case CRUSH_RULE_CHOOSE_INDEP:
992	if (wsize == `0`)
993	break;
994
995	recurse_to_leaf =
996	curstep->op ==
997	CRUSH_RULE_CHOOSELEAF_FIRSTN \|\|
998	curstep->op ==
999	CRUSH_RULE_CHOOSELEAF_INDEP;
1000
1001	/ reset output /
1002	osize = `0`;
1003
1004	for (i = `0`; i < wsize; i++) {
1005	int bno;
1006	numrep = curstep->arg1;
1007	if (numrep <= `0`) {
1008	numrep += result_max;
1009	if (numrep <= `0`)
1010	continue;
1011	}
1012	j = `0`;
1013	/ make sure bucket id is valid /
1014	bno = -`1` - w[i];
1015	if (bno < `0` \|\| bno >= map->max_buckets) {
1016	/ w[i] is probably CRUSH_ITEM_NONE /
1017	dprintk(" bad w[i] %d\n", w[i]);
1018	continue;
1019	}
1020	if (firstn) {
1021	int recurse_tries;
1022	if (choose_leaf_tries)
1023	recurse_tries =
1024	choose_leaf_tries;
1025	else if (map->chooseleaf_descend_once)
1026	recurse_tries = `1`;
1027	else
1028	recurse_tries = choose_tries;
1029	osize += crush_choose_firstn(
1030	map,
1031	work: cw,
1032	bucket: map->buckets[bno],
1033	weight, weight_max,
1034	x, numrep,
1035	type: curstep->arg2,
1036	out: o+osize, outpos: j,
1037	out_size: result_max-osize,
1038	tries: choose_tries,
1039	recurse_tries,
1040	local_retries: choose_local_retries,
1041	local_fallback_retries: choose_local_fallback_retries,
1042	recurse_to_leaf,
1043	vary_r,
1044	stable,
1045	out2: c+osize,
1046	parent_r: `0`,
1047	choose_args);
1048	} else {
1049	out_size = ((numrep < (result_max-osize)) ?
1050	numrep : (result_max-osize));
1051	crush_choose_indep(
1052	map,
1053	work: cw,
1054	bucket: map->buckets[bno],
1055	weight, weight_max,
1056	x, left: out_size, numrep,
1057	type: curstep->arg2,
1058	out: o+osize, outpos: j,
1059	tries: choose_tries,
1060	recurse_tries: choose_leaf_tries ?
1061	choose_leaf_tries : `1`,
1062	recurse_to_leaf,
1063	out2: c+osize,
1064	parent_r: `0`,
1065	choose_args);
1066	osize += out_size;
1067	}
1068	}
1069
1070	if (recurse_to_leaf)
1071	/ copy final _leaf_ values to output set /
1072	memcpy(o, c, osize*sizeof(*o));
1073
1074	/ swap o and w arrays /
1075	swap(o, w);
1076	wsize = osize;
1077	break;
1078
1079
1080	case CRUSH_RULE_EMIT:
1081	for (i = `0`; i < wsize && result_len < result_max; i++) {
1082	result[result_len] = w[i];
1083	result_len++;
1084	}
1085	wsize = `0`;
1086	break;
1087
1088	default:
1089	dprintk(" unknown op %d at step %d\n",
1090	curstep->op, step);
1091	break;
1092	}
1093	}
1094
1095	return result_len;
1096	}
1097

source code of linux/net/ceph/crush/mapper.c