uptodate.c source code [linux/fs/ocfs2/uptodate.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* uptodate.c
4	*
5	* Tracking the up-to-date-ness of a local buffer_head with respect to
6	* the cluster.
7	*
8	* Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
9	*
10	* Standard buffer head caching flags (uptodate, etc) are insufficient
11	* in a clustered environment - a buffer may be marked up to date on
12	* our local node but could have been modified by another cluster
13	* member. As a result an additional (and performant) caching scheme
14	* is required. A further requirement is that we consume as little
15	* memory as possible - we never pin buffer_head structures in order
16	* to cache them.
17	*
18	* We track the existence of up to date buffers on the inodes which
19	* are associated with them. Because we don't want to pin
20	* buffer_heads, this is only a (strong) hint and several other checks
21	* are made in the I/O path to ensure that we don't use a stale or
22	* invalid buffer without going to disk:
23	* - buffer_jbd is used liberally - if a bh is in the journal on
24	* this node then it must be up to date.
25	* - the standard buffer_uptodate() macro is used to detect buffers
26	* which may be invalid (even if we have an up to date tracking
27	* item for them)
28	*
29	* For a full understanding of how this code works together, one
30	* should read the callers in dlmglue.c, the I/O functions in
31	* buffer_head_io.c and ocfs2_journal_access in journal.c
32	*/
33
34	#include <linux/fs.h>
35	#include <linux/types.h>
36	#include <linux/slab.h>
37	#include <linux/highmem.h>
38	#include <linux/buffer_head.h>
39	#include <linux/rbtree.h>
40
41	#include <cluster/masklog.h>
42
43	#include "ocfs2.h"
44
45	#include "inode.h"
46	#include "uptodate.h"
47	#include "ocfs2_trace.h"
48
49	struct ocfs2_meta_cache_item {
50	struct rb_node c_node;
51	sector_t c_block;
52	};
53
54	static struct kmem_cache *ocfs2_uptodate_cachep;
55
56	u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
57	{
58	BUG_ON(!ci \|\| !ci->ci_ops);
59
60	return ci->ci_ops->co_owner(ci);
61	}
62
63	struct super_block ocfs2_metadata_cache_get_super(struct* ocfs2_caching_info *ci)
64	{
65	BUG_ON(!ci \|\| !ci->ci_ops);
66
67	return ci->ci_ops->co_get_super(ci);
68	}
69
70	static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci)
71	{
72	BUG_ON(!ci \|\| !ci->ci_ops);
73
74	ci->ci_ops->co_cache_lock(ci);
75	}
76
77	static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci)
78	{
79	BUG_ON(!ci \|\| !ci->ci_ops);
80
81	ci->ci_ops->co_cache_unlock(ci);
82	}
83
84	void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci)
85	{
86	BUG_ON(!ci \|\| !ci->ci_ops);
87
88	ci->ci_ops->co_io_lock(ci);
89	}
90
91	void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci)
92	{
93	BUG_ON(!ci \|\| !ci->ci_ops);
94
95	ci->ci_ops->co_io_unlock(ci);
96	}
97
98
99	static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci,
100	int clear)
101	{
102	ci->ci_flags \|= OCFS2_CACHE_FL_INLINE;
103	ci->ci_num_cached = `0`;
104
105	if (clear) {
106	ci->ci_created_trans = `0`;
107	ci->ci_last_trans = `0`;
108	}
109	}
110
111	void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
112	const struct ocfs2_caching_operations *ops)
113	{
114	BUG_ON(!ops);
115
116	ci->ci_ops = ops;
117	ocfs2_metadata_cache_reset(ci, clear: `1`);
118	}
119
120	void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci)
121	{
122	ocfs2_metadata_cache_purge(ci);
123	ocfs2_metadata_cache_reset(ci, clear: `1`);
124	}
125
126
127	/ No lock taken here as 'root' is not expected to be visible to other*
128	* processes. */
129	static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
130	{
131	unsigned int purged = `0`;
132	struct rb_node *node;
133	struct ocfs2_meta_cache_item *item;
134
135	while ((node = rb_last(root)) != NULL) {
136	item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
137
138	trace_ocfs2_purge_copied_metadata_tree(
139	num: (unsigned long long) item->c_block);
140
141	rb_erase(&item->c_node, root);
142	kmem_cache_free(s: ocfs2_uptodate_cachep, objp: item);
143
144	purged++;
145	}
146	return purged;
147	}
148
149	/ Called from locking and called from ocfs2_clear_inode. Dump the*
150	* cache for a given inode.
151	*
152	* This function is a few more lines longer than necessary due to some
153	* accounting done here, but I think it's worth tracking down those
154	* bugs sooner -- Mark */
155	void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
156	{
157	unsigned int tree, to_purge, purged;
158	struct rb_root root = RB_ROOT;
159
160	BUG_ON(!ci \|\| !ci->ci_ops);
161
162	ocfs2_metadata_cache_lock(ci);
163	tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
164	to_purge = ci->ci_num_cached;
165
166	trace_ocfs2_metadata_cache_purge(
167	val1: (unsigned long long)ocfs2_metadata_cache_owner(ci),
168	val2: to_purge, val3: tree);
169
170	/ If we're a tree, save off the root so that we can safely*
171	* initialize the cache. We do the work to free tree members
172	* without the spinlock. */
173	if (tree)
174	root = ci->ci_cache.ci_tree;
175
176	ocfs2_metadata_cache_reset(ci, clear: `0`);
177	ocfs2_metadata_cache_unlock(ci);
178
179	purged = ocfs2_purge_copied_metadata_tree(root: &root);
180	/ If possible, track the number wiped so that we can more*
181	* easily detect counting errors. Unfortunately, this is only
182	* meaningful for trees. */
183	if (tree && purged != to_purge)
184	mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n",
185	(unsigned long long)ocfs2_metadata_cache_owner(ci),
186	to_purge, purged);
187	}
188
189	/ Returns the index in the cache array, -1 if not found.*
190	* Requires ip_lock. */
191	static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
192	sector_t item)
193	{
194	int i;
195
196	for (i = `0`; i < ci->ci_num_cached; i++) {
197	if (item == ci->ci_cache.ci_array[i])
198	return i;
199	}
200
201	return -`1`;
202	}
203
204	/ Returns the cache item if found, otherwise NULL.*
205	* Requires ip_lock. */
206	static struct ocfs2_meta_cache_item *
207	ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
208	sector_t block)
209	{
210	struct rb_node * n = ci->ci_cache.ci_tree.rb_node;
211	struct ocfs2_meta_cache_item *item = NULL;
212
213	while (n) {
214	item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);
215
216	if (block < item->c_block)
217	n = n->rb_left;
218	else if (block > item->c_block)
219	n = n->rb_right;
220	else
221	return item;
222	}
223
224	return NULL;
225	}
226
227	static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
228	struct buffer_head *bh)
229	{
230	int index = -`1`;
231	struct ocfs2_meta_cache_item *item = NULL;
232
233	ocfs2_metadata_cache_lock(ci);
234
235	trace_ocfs2_buffer_cached_begin(
236	val1: (unsigned long long)ocfs2_metadata_cache_owner(ci),
237	val2: (unsigned long long) bh->b_blocknr,
238	val3: !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
239
240	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
241	index = ocfs2_search_cache_array(ci, item: bh->b_blocknr);
242	else
243	item = ocfs2_search_cache_tree(ci, block: bh->b_blocknr);
244
245	ocfs2_metadata_cache_unlock(ci);
246
247	trace_ocfs2_buffer_cached_end(index, item);
248
249	return (index != -`1`) \|\| (item != NULL);
250	}
251
252	/ Warning: even if it returns true, this does not guarantee that*
253	* the block is stored in our inode metadata cache.
254	*
255	* This can be called under lock_buffer()
256	*/
257	int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
258	struct buffer_head *bh)
259	{
260	/ Doesn't matter if the bh is in our cache or not -- if it's*
261	* not marked uptodate then we know it can't have correct
262	* data. */
263	if (!buffer_uptodate(bh))
264	return `0`;
265
266	/ OCFS2 does not allow multiple nodes to be changing the same*
267	* block at the same time. */
268	if (buffer_jbd(bh))
269	return `1`;
270
271	/ Ok, locally the buffer is marked as up to date, now search*
272	* our cache to see if we can trust that. */
273	return ocfs2_buffer_cached(ci, bh);
274	}
275
276	/*
277	* Determine whether a buffer is currently out on a read-ahead request.
278	* ci_io_sem should be held to serialize submitters with the logic here.
279	*/
280	int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
281	struct buffer_head *bh)
282	{
283	return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh);
284	}
285
286	/ Requires ip_lock /
287	static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
288	sector_t block)
289	{
290	BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
291
292	trace_ocfs2_append_cache_array(
293	val1: (unsigned long long)ocfs2_metadata_cache_owner(ci),
294	val2: (unsigned long long)block, val3: ci->ci_num_cached);
295
296	ci->ci_cache.ci_array[ci->ci_num_cached] = block;
297	ci->ci_num_cached++;
298	}
299
300	/* By now the caller should have checked that the item does not
301	* exist in the tree.
302	* Requires ip_lock. */
303	static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
304	struct ocfs2_meta_cache_item *new)
305	{
306	sector_t block = new->c_block;
307	struct rb_node *parent = NULL;
308	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
309	struct ocfs2_meta_cache_item *tmp;
310
311	trace_ocfs2_insert_cache_tree(
312	val1: (unsigned long long)ocfs2_metadata_cache_owner(ci),
313	val2: (unsigned long long)block, val3: ci->ci_num_cached);
314
315	while(*p) {
316	parent = *p;
317
318	tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);
319
320	if (block < tmp->c_block)
321	p = &(*p)->rb_left;
322	else if (block > tmp->c_block)
323	p = &(*p)->rb_right;
324	else {
325	/ This should never happen! /
326	mlog(ML_ERROR, "Duplicate block %llu cached!\n",
327	(unsigned long long) block);
328	BUG();
329	}
330	}
331
332	rb_link_node(node: &new->c_node, parent, rb_link: p);
333	rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
334	ci->ci_num_cached++;
335	}
336
337	/ co_cache_lock() must be held /
338	static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci)
339	{
340	return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) &&
341	(ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY);
342	}
343
344	/ tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the*
345	* pointers in tree after we use them - this allows caller to detect
346	* when to free in case of error.
347	*
348	* The co_cache_lock() must be held. */
349	static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
350	struct ocfs2_meta_cache_item **tree)
351	{
352	int i;
353
354	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY,
355	"Owner %llu, num cached = %u, should be %u\n",
356	(unsigned long long)ocfs2_metadata_cache_owner(ci),
357	ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY);
358	mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE),
359	"Owner %llu not marked as inline anymore!\n",
360	(unsigned long long)ocfs2_metadata_cache_owner(ci));
361
362	/ Be careful to initialize the tree members first because*
363	* once the ci_tree is used, the array is junk... */
364	for (i = `0`; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
365	tree[i]->c_block = ci->ci_cache.ci_array[i];
366
367	ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE;
368	ci->ci_cache.ci_tree = RB_ROOT;
369	/ this will be set again by __ocfs2_insert_cache_tree /
370	ci->ci_num_cached = `0`;
371
372	for (i = `0`; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
373	__ocfs2_insert_cache_tree(ci, new: tree[i]);
374	tree[i] = NULL;
375	}
376
377	trace_ocfs2_expand_cache(
378	val1: (unsigned long long)ocfs2_metadata_cache_owner(ci),
379	val2: ci->ci_flags, val3: ci->ci_num_cached);
380	}
381
382	/ Slow path function - memory allocation is necessary. See the*
383	* comment above ocfs2_set_buffer_uptodate for more information. */
384	static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
385	sector_t block,
386	int expand_tree)
387	{
388	int i;
389	struct ocfs2_meta_cache_item *new = NULL;
390	struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
391	{ NULL, };
392
393	trace_ocfs2_set_buffer_uptodate(
394	val1: (unsigned long long)ocfs2_metadata_cache_owner(ci),
395	val2: (unsigned long long)block, val3: expand_tree);
396
397	new = kmem_cache_alloc(cachep: ocfs2_uptodate_cachep, GFP_NOFS);
398	if (!new) {
399	mlog_errno(-ENOMEM);
400	return;
401	}
402	new->c_block = block;
403
404	if (expand_tree) {
405	/ Do not allocate an array here - the removal code*
406	* has no way of tracking that. */
407	for (i = `0`; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
408	tree[i] = kmem_cache_alloc(cachep: ocfs2_uptodate_cachep,
409	GFP_NOFS);
410	if (!tree[i]) {
411	mlog_errno(-ENOMEM);
412	goto out_free;
413	}
414
415	/ These are initialized in ocfs2_expand_cache! /
416	}
417	}
418
419	ocfs2_metadata_cache_lock(ci);
420	if (ocfs2_insert_can_use_array(ci)) {
421	/ Ok, items were removed from the cache in between*
422	* locks. Detect this and revert back to the fast path */
423	ocfs2_append_cache_array(ci, block);
424	ocfs2_metadata_cache_unlock(ci);
425	goto out_free;
426	}
427
428	if (expand_tree)
429	ocfs2_expand_cache(ci, tree);
430
431	__ocfs2_insert_cache_tree(ci, new);
432	ocfs2_metadata_cache_unlock(ci);
433
434	new = NULL;
435	out_free:
436	if (new)
437	kmem_cache_free(s: ocfs2_uptodate_cachep, objp: new);
438
439	/ If these were used, then ocfs2_expand_cache re-set them to*
440	* NULL for us. */
441	if (tree[`0`]) {
442	for (i = `0`; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
443	if (tree[i])
444	kmem_cache_free(s: ocfs2_uptodate_cachep,
445	objp: tree[i]);
446	}
447	}
448
449	/ Item insertion is guarded by co_io_lock(), so the insertion path takes*
450	* advantage of this by not rechecking for a duplicate insert during
451	* the slow case. Additionally, if the cache needs to be bumped up to
452	* a tree, the code will not recheck after acquiring the lock --
453	* multiple paths cannot be expanding to a tree at the same time.
454	*
455	* The slow path takes into account that items can be removed
456	* (including the whole tree wiped and reset) when this process it out
457	* allocating memory. In those cases, it reverts back to the fast
458	* path.
459	*
460	* Note that this function may actually fail to insert the block if
461	* memory cannot be allocated. This is not fatal however (but may
462	* result in a performance penalty)
463	*
464	* Readahead buffers can be passed in here before the I/O request is
465	* completed.
466	*/
467	void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
468	struct buffer_head *bh)
469	{
470	int expand;
471
472	/ The block may very well exist in our cache already, so avoid*
473	* doing any more work in that case. */
474	if (ocfs2_buffer_cached(ci, bh))
475	return;
476
477	trace_ocfs2_set_buffer_uptodate_begin(
478	val1: (unsigned long long)ocfs2_metadata_cache_owner(ci),
479	val2: (unsigned long long)bh->b_blocknr);
480
481	/ No need to recheck under spinlock - insertion is guarded by*
482	* co_io_lock() */
483	ocfs2_metadata_cache_lock(ci);
484	if (ocfs2_insert_can_use_array(ci)) {
485	/ Fast case - it's an array and there's a free*
486	* spot. */
487	ocfs2_append_cache_array(ci, block: bh->b_blocknr);
488	ocfs2_metadata_cache_unlock(ci);
489	return;
490	}
491
492	expand = `0`;
493	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
494	/ We need to bump things up to a tree. /
495	expand = `1`;
496	}
497	ocfs2_metadata_cache_unlock(ci);
498
499	__ocfs2_set_buffer_uptodate(ci, block: bh->b_blocknr, expand_tree: expand);
500	}
501
502	/ Called against a newly allocated buffer. Most likely nobody should*
503	* be able to read this sort of metadata while it's still being
504	* allocated, but this is careful to take co_io_lock() anyway. */
505	void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
506	struct buffer_head *bh)
507	{
508	/ This should definitely not exist in our cache /
509	BUG_ON(ocfs2_buffer_cached(ci, bh));
510
511	set_buffer_uptodate(bh);
512
513	ocfs2_metadata_cache_io_lock(ci);
514	ocfs2_set_buffer_uptodate(ci, bh);
515	ocfs2_metadata_cache_io_unlock(ci);
516	}
517
518	/ Requires ip_lock. /
519	static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
520	int index)
521	{
522	sector_t *array = ci->ci_cache.ci_array;
523	int bytes;
524
525	BUG_ON(index < `0` \|\| index >= OCFS2_CACHE_INFO_MAX_ARRAY);
526	BUG_ON(index >= ci->ci_num_cached);
527	BUG_ON(!ci->ci_num_cached);
528
529	trace_ocfs2_remove_metadata_array(
530	val1: (unsigned long long)ocfs2_metadata_cache_owner(ci),
531	val2: index, val3: ci->ci_num_cached);
532
533	ci->ci_num_cached--;
534
535	/ don't need to copy if the array is now empty, or if we*
536	* removed at the tail */
537	if (ci->ci_num_cached && index < ci->ci_num_cached) {
538	bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
539	memmove(&array[index], &array[index + `1`], bytes);
540	}
541	}
542
543	/ Requires ip_lock. /
544	static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
545	struct ocfs2_meta_cache_item *item)
546	{
547	trace_ocfs2_remove_metadata_tree(
548	val1: (unsigned long long)ocfs2_metadata_cache_owner(ci),
549	val2: (unsigned long long)item->c_block);
550
551	rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
552	ci->ci_num_cached--;
553	}
554
555	static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
556	sector_t block)
557	{
558	int index;
559	struct ocfs2_meta_cache_item *item = NULL;
560
561	ocfs2_metadata_cache_lock(ci);
562	trace_ocfs2_remove_block_from_cache(
563	ull: (unsigned long long)ocfs2_metadata_cache_owner(ci),
564	ull1: (unsigned long long) block, value2: ci->ci_num_cached,
565	value3: ci->ci_flags);
566
567	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
568	index = ocfs2_search_cache_array(ci, item: block);
569	if (index != -`1`)
570	ocfs2_remove_metadata_array(ci, index);
571	} else {
572	item = ocfs2_search_cache_tree(ci, block);
573	if (item)
574	ocfs2_remove_metadata_tree(ci, item);
575	}
576	ocfs2_metadata_cache_unlock(ci);
577
578	if (item)
579	kmem_cache_free(s: ocfs2_uptodate_cachep, objp: item);
580	}
581
582	/*
583	* Called when we remove a chunk of metadata from an inode. We don't
584	* bother reverting things to an inlined array in the case of a remove
585	* which moves us back under the limit.
586	*/
587	void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
588	struct buffer_head *bh)
589	{
590	sector_t block = bh->b_blocknr;
591
592	ocfs2_remove_block_from_cache(ci, block);
593	}
594
595	/ Called when we remove xattr clusters from an inode. /
596	void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
597	sector_t block,
598	u32 c_len)
599	{
600	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
601	unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, clusters: `1`) * c_len;
602
603	for (i = `0`; i < b_len; i++, block++)
604	ocfs2_remove_block_from_cache(ci, block);
605	}
606
607	int __init init_ocfs2_uptodate_cache(void)
608	{
609	ocfs2_uptodate_cachep = kmem_cache_create(name: "ocfs2_uptodate",
610	size: sizeof(struct ocfs2_meta_cache_item),
611	align: `0`, SLAB_HWCACHE_ALIGN, NULL);
612	if (!ocfs2_uptodate_cachep)
613	return -ENOMEM;
614
615	return `0`;
616	}
617
618	void exit_ocfs2_uptodate_cache(void)
619	{
620	kmem_cache_destroy(s: ocfs2_uptodate_cachep);
621	}
622

source code of linux/fs/ocfs2/uptodate.c