xfs_log.c source code [linux/fs/xfs/xfs_log.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (c) 2000-2005 Silicon Graphics, Inc.
4	* All Rights Reserved.
5	*/
6	#include "xfs.h"
7	#include "xfs_fs.h"
8	#include "xfs_shared.h"
9	#include "xfs_format.h"
10	#include "xfs_log_format.h"
11	#include "xfs_trans_resv.h"
12	#include "xfs_mount.h"
13	#include "xfs_errortag.h"
14	#include "xfs_error.h"
15	#include "xfs_trans.h"
16	#include "xfs_trans_priv.h"
17	#include "xfs_log.h"
18	#include "xfs_log_priv.h"
19	#include "xfs_trace.h"
20	#include "xfs_sysfs.h"
21	#include "xfs_sb.h"
22	#include "xfs_health.h"
23
24	struct kmem_cache *xfs_log_ticket_cache;
25
26	/ Local miscellaneous function prototypes /
27	STATIC struct xlog *
28	xlog_alloc_log(
29	struct xfs_mount *mp,
30	struct xfs_buftarg *log_target,
31	xfs_daddr_t blk_offset,
32	int num_bblks);
33	STATIC int
34	xlog_space_left(
35	struct xlog *log,
36	atomic64_t *head);
37	STATIC void
38	xlog_dealloc_log(
39	struct xlog *log);
40
41	/ local state machine functions /
42	STATIC void xlog_state_done_syncing(
43	struct xlog_in_core *iclog);
44	STATIC void xlog_state_do_callback(
45	struct xlog *log);
46	STATIC int
47	xlog_state_get_iclog_space(
48	struct xlog *log,
49	int len,
50	struct xlog_in_core **iclog,
51	struct xlog_ticket *ticket,
52	int *logoffsetp);
53	STATIC void
54	xlog_grant_push_ail(
55	struct xlog *log,
56	int need_bytes);
57	STATIC void
58	xlog_sync(
59	struct xlog *log,
60	struct xlog_in_core *iclog,
61	struct xlog_ticket *ticket);
62	#if defined(DEBUG)
63	STATIC void
64	xlog_verify_grant_tail(
65	struct xlog *log);
66	STATIC void
67	xlog_verify_iclog(
68	struct xlog *log,
69	struct xlog_in_core *iclog,
70	int count);
71	STATIC void
72	xlog_verify_tail_lsn(
73	struct xlog *log,
74	struct xlog_in_core *iclog);
75	#else
76	#define xlog_verify_grant_tail(a)
77	#define xlog_verify_iclog(a,b,c)
78	#define xlog_verify_tail_lsn(a,b)
79	#endif
80
81	STATIC int
82	xlog_iclogs_empty(
83	struct xlog *log);
84
85	static int
86	xfs_log_cover(struct xfs_mount *);
87
88	/*
89	* We need to make sure the buffer pointer returned is naturally aligned for the
90	* biggest basic data type we put into it. We have already accounted for this
91	* padding when sizing the buffer.
92	*
93	* However, this padding does not get written into the log, and hence we have to
94	* track the space used by the log vectors separately to prevent log space hangs
95	* due to inaccurate accounting (i.e. a leak) of the used log space through the
96	* CIL context ticket.
97	*
98	* We also add space for the xlog_op_header that describes this region in the
99	* log. This prepends the data region we return to the caller to copy their data
100	* into, so do all the static initialisation of the ophdr now. Because the ophdr
101	* is not 8 byte aligned, we have to be careful to ensure that we align the
102	* start of the buffer such that the region we return to the call is 8 byte
103	* aligned and packed against the tail of the ophdr.
104	*/
105	void *
106	xlog_prepare_iovec(
107	struct xfs_log_vec *lv,
108	struct xfs_log_iovec **vecp,
109	uint type)
110	{
111	struct xfs_log_iovec vec = vecp;
112	struct xlog_op_header *oph;
113	uint32_t len;
114	void *buf;
115
116	if (vec) {
117	ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
118	vec++;
119	} else {
120	vec = &lv->lv_iovecp[`0`];
121	}
122
123	len = lv->lv_buf_len + sizeof(struct xlog_op_header);
124	if (!IS_ALIGNED(len, sizeof(uint64_t))) {
125	lv->lv_buf_len = round_up(len, sizeof(uint64_t)) -
126	sizeof(struct xlog_op_header);
127	}
128
129	vec->i_type = type;
130	vec->i_addr = lv->lv_buf + lv->lv_buf_len;
131
132	oph = vec->i_addr;
133	oph->oh_clientid = XFS_TRANSACTION;
134	oph->oh_res2 = `0`;
135	oph->oh_flags = `0`;
136
137	buf = vec->i_addr + sizeof(struct xlog_op_header);
138	ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t)));
139
140	*vecp = vec;
141	return buf;
142	}
143
144	static void
145	xlog_grant_sub_space(
146	struct xlog *log,
147	atomic64_t *head,
148	int bytes)
149	{
150	int64_t head_val = atomic64_read(v: head);
151	int64_t new, old;
152
153	do {
154	int cycle, space;
155
156	xlog_crack_grant_head_val(val: head_val, cycle: &cycle, space: &space);
157
158	space -= bytes;
159	if (space < `0`) {
160	space += log->l_logsize;
161	cycle--;
162	}
163
164	old = head_val;
165	new = xlog_assign_grant_head_val(cycle, space);
166	head_val = atomic64_cmpxchg(v: head, old, new);
167	} while (head_val != old);
168	}
169
170	static void
171	xlog_grant_add_space(
172	struct xlog *log,
173	atomic64_t *head,
174	int bytes)
175	{
176	int64_t head_val = atomic64_read(v: head);
177	int64_t new, old;
178
179	do {
180	int tmp;
181	int cycle, space;
182
183	xlog_crack_grant_head_val(val: head_val, cycle: &cycle, space: &space);
184
185	tmp = log->l_logsize - space;
186	if (tmp > bytes)
187	space += bytes;
188	else {
189	space = bytes - tmp;
190	cycle++;
191	}
192
193	old = head_val;
194	new = xlog_assign_grant_head_val(cycle, space);
195	head_val = atomic64_cmpxchg(v: head, old, new);
196	} while (head_val != old);
197	}
198
199	STATIC void
200	xlog_grant_head_init(
201	struct xlog_grant_head *head)
202	{
203	xlog_assign_grant_head(head: &head->grant, cycle: `1`, space: `0`);
204	INIT_LIST_HEAD(list: &head->waiters);
205	spin_lock_init(&head->lock);
206	}
207
208	STATIC void
209	xlog_grant_head_wake_all(
210	struct xlog_grant_head *head)
211	{
212	struct xlog_ticket *tic;
213
214	spin_lock(lock: &head->lock);
215	list_for_each_entry(tic, &head->waiters, t_queue)
216	wake_up_process(tsk: tic->t_task);
217	spin_unlock(lock: &head->lock);
218	}
219
220	static inline int
221	xlog_ticket_reservation(
222	struct xlog *log,
223	struct xlog_grant_head *head,
224	struct xlog_ticket *tic)
225	{
226	if (head == &log->l_write_head) {
227	ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
228	return tic->t_unit_res;
229	}
230
231	if (tic->t_flags & XLOG_TIC_PERM_RESERV)
232	return tic->t_unit_res * tic->t_cnt;
233
234	return tic->t_unit_res;
235	}
236
237	STATIC bool
238	xlog_grant_head_wake(
239	struct xlog *log,
240	struct xlog_grant_head *head,
241	int *free_bytes)
242	{
243	struct xlog_ticket *tic;
244	int need_bytes;
245	bool woken_task = false;
246
247	list_for_each_entry(tic, &head->waiters, t_queue) {
248
249	/*
250	* There is a chance that the size of the CIL checkpoints in
251	* progress at the last AIL push target calculation resulted in
252	* limiting the target to the log head (l_last_sync_lsn) at the
253	* time. This may not reflect where the log head is now as the
254	* CIL checkpoints may have completed.
255	*
256	* Hence when we are woken here, it may be that the head of the
257	* log that has moved rather than the tail. As the tail didn't
258	* move, there still won't be space available for the
259	* reservation we require. However, if the AIL has already
260	* pushed to the target defined by the old log head location, we
261	* will hang here waiting for something else to update the AIL
262	* push target.
263	*
264	* Therefore, if there isn't space to wake the first waiter on
265	* the grant head, we need to push the AIL again to ensure the
266	* target reflects both the current log tail and log head
267	* position before we wait for the tail to move again.
268	*/
269
270	need_bytes = xlog_ticket_reservation(log, head, tic);
271	if (*free_bytes < need_bytes) {
272	if (!woken_task)
273	xlog_grant_push_ail(log, need_bytes);
274	return false;
275	}
276
277	*free_bytes -= need_bytes;
278	trace_xfs_log_grant_wake_up(log, tic);
279	wake_up_process(tsk: tic->t_task);
280	woken_task = true;
281	}
282
283	return true;
284	}
285
286	STATIC int
287	xlog_grant_head_wait(
288	struct xlog *log,
289	struct xlog_grant_head *head,
290	struct xlog_ticket *tic,
291	int need_bytes) __releases(&head->lock)
292	__acquires(&head->lock)
293	{
294	list_add_tail(new: &tic->t_queue, head: &head->waiters);
295
296	do {
297	if (xlog_is_shutdown(log))
298	goto shutdown;
299	xlog_grant_push_ail(log, need_bytes);
300
301	__set_current_state(TASK_UNINTERRUPTIBLE);
302	spin_unlock(lock: &head->lock);
303
304	XFS_STATS_INC(log->l_mp, xs_sleep_logspace);
305
306	trace_xfs_log_grant_sleep(log, tic);
307	schedule();
308	trace_xfs_log_grant_wake(log, tic);
309
310	spin_lock(lock: &head->lock);
311	if (xlog_is_shutdown(log))
312	goto shutdown;
313	} while (xlog_space_left(log, head: &head->grant) < need_bytes);
314
315	list_del_init(entry: &tic->t_queue);
316	return `0`;
317	shutdown:
318	list_del_init(entry: &tic->t_queue);
319	return -EIO;
320	}
321
322	/*
323	* Atomically get the log space required for a log ticket.
324	*
325	* Once a ticket gets put onto head->waiters, it will only return after the
326	* needed reservation is satisfied.
327	*
328	* This function is structured so that it has a lock free fast path. This is
329	* necessary because every new transaction reservation will come through this
330	* path. Hence any lock will be globally hot if we take it unconditionally on
331	* every pass.
332	*
333	* As tickets are only ever moved on and off head->waiters under head->lock, we
334	* only need to take that lock if we are going to add the ticket to the queue
335	* and sleep. We can avoid taking the lock if the ticket was never added to
336	* head->waiters because the t_queue list head will be empty and we hold the
337	* only reference to it so it can safely be checked unlocked.
338	*/
339	STATIC int
340	xlog_grant_head_check(
341	struct xlog *log,
342	struct xlog_grant_head *head,
343	struct xlog_ticket *tic,
344	int *need_bytes)
345	{
346	int free_bytes;
347	int error = `0`;
348
349	ASSERT(!xlog_in_recovery(log));
350
351	/*
352	* If there are other waiters on the queue then give them a chance at
353	* logspace before us. Wake up the first waiters, if we do not wake
354	* up all the waiters then go to sleep waiting for more free space,
355	* otherwise try to get some space for this transaction.
356	*/
357	*need_bytes = xlog_ticket_reservation(log, head, tic);
358	free_bytes = xlog_space_left(log, head: &head->grant);
359	if (!list_empty_careful(head: &head->waiters)) {
360	spin_lock(lock: &head->lock);
361	if (!xlog_grant_head_wake(log, head, free_bytes: &free_bytes) \|\|
362	free_bytes < *need_bytes) {
363	error = xlog_grant_head_wait(log, head, tic,
364	need_bytes: *need_bytes);
365	}
366	spin_unlock(lock: &head->lock);
367	} else if (free_bytes < *need_bytes) {
368	spin_lock(lock: &head->lock);
369	error = xlog_grant_head_wait(log, head, tic, need_bytes: *need_bytes);
370	spin_unlock(lock: &head->lock);
371	}
372
373	return error;
374	}
375
376	bool
377	xfs_log_writable(
378	struct xfs_mount *mp)
379	{
380	/*
381	* Do not write to the log on norecovery mounts, if the data or log
382	* devices are read-only, or if the filesystem is shutdown. Read-only
383	* mounts allow internal writes for log recovery and unmount purposes,
384	* so don't restrict that case.
385	*/
386	if (xfs_has_norecovery(mp))
387	return false;
388	if (xfs_readonly_buftarg(mp->m_ddev_targp))
389	return false;
390	if (xfs_readonly_buftarg(mp->m_log->l_targ))
391	return false;
392	if (xlog_is_shutdown(log: mp->m_log))
393	return false;
394	return true;
395	}
396
397	/*
398	* Replenish the byte reservation required by moving the grant write head.
399	*/
400	int
401	xfs_log_regrant(
402	struct xfs_mount *mp,
403	struct xlog_ticket *tic)
404	{
405	struct xlog *log = mp->m_log;
406	int need_bytes;
407	int error = `0`;
408
409	if (xlog_is_shutdown(log))
410	return -EIO;
411
412	XFS_STATS_INC(mp, xs_try_logspace);
413
414	/*
415	* This is a new transaction on the ticket, so we need to change the
416	* transaction ID so that the next transaction has a different TID in
417	* the log. Just add one to the existing tid so that we can see chains
418	* of rolling transactions in the log easily.
419	*/
420	tic->t_tid++;
421
422	xlog_grant_push_ail(log, need_bytes: tic->t_unit_res);
423
424	tic->t_curr_res = tic->t_unit_res;
425	if (tic->t_cnt > `0`)
426	return `0`;
427
428	trace_xfs_log_regrant(log, tic);
429
430	error = xlog_grant_head_check(log, head: &log->l_write_head, tic,
431	need_bytes: &need_bytes);
432	if (error)
433	goto out_error;
434
435	xlog_grant_add_space(log, head: &log->l_write_head.grant, bytes: need_bytes);
436	trace_xfs_log_regrant_exit(log, tic);
437	xlog_verify_grant_tail(log);
438	return `0`;
439
440	out_error:
441	/*
442	* If we are failing, make sure the ticket doesn't have any current
443	* reservations. We don't want to add this back when the ticket/
444	* transaction gets cancelled.
445	*/
446	tic->t_curr_res = `0`;
447	tic->t_cnt = `0`; / ungrant will give back unit_res * t_cnt. /
448	return error;
449	}
450
451	/*
452	* Reserve log space and return a ticket corresponding to the reservation.
453	*
454	* Each reservation is going to reserve extra space for a log record header.
455	* When writes happen to the on-disk log, we don't subtract the length of the
456	* log record header from any reservation. By wasting space in each
457	* reservation, we prevent over allocation problems.
458	*/
459	int
460	xfs_log_reserve(
461	struct xfs_mount *mp,
462	int unit_bytes,
463	int cnt,
464	struct xlog_ticket **ticp,
465	bool permanent)
466	{
467	struct xlog *log = mp->m_log;
468	struct xlog_ticket *tic;
469	int need_bytes;
470	int error = `0`;
471
472	if (xlog_is_shutdown(log))
473	return -EIO;
474
475	XFS_STATS_INC(mp, xs_try_logspace);
476
477	ASSERT(*ticp == NULL);
478	tic = xlog_ticket_alloc(log, unit_bytes, count: cnt, permanent);
479	*ticp = tic;
480
481	xlog_grant_push_ail(log, need_bytes: tic->t_cnt ? tic->t_unit_res * tic->t_cnt
482	: tic->t_unit_res);
483
484	trace_xfs_log_reserve(log, tic);
485
486	error = xlog_grant_head_check(log, head: &log->l_reserve_head, tic,
487	need_bytes: &need_bytes);
488	if (error)
489	goto out_error;
490
491	xlog_grant_add_space(log, head: &log->l_reserve_head.grant, bytes: need_bytes);
492	xlog_grant_add_space(log, head: &log->l_write_head.grant, bytes: need_bytes);
493	trace_xfs_log_reserve_exit(log, tic);
494	xlog_verify_grant_tail(log);
495	return `0`;
496
497	out_error:
498	/*
499	* If we are failing, make sure the ticket doesn't have any current
500	* reservations. We don't want to add this back when the ticket/
501	* transaction gets cancelled.
502	*/
503	tic->t_curr_res = `0`;
504	tic->t_cnt = `0`; / ungrant will give back unit_res * t_cnt. /
505	return error;
506	}
507
508	/*
509	* Run all the pending iclog callbacks and wake log force waiters and iclog
510	* space waiters so they can process the newly set shutdown state. We really
511	* don't care what order we process callbacks here because the log is shut down
512	* and so state cannot change on disk anymore. However, we cannot wake waiters
513	* until the callbacks have been processed because we may be in unmount and
514	* we must ensure that all AIL operations the callbacks perform have completed
515	* before we tear down the AIL.
516	*
517	* We avoid processing actively referenced iclogs so that we don't run callbacks
518	* while the iclog owner might still be preparing the iclog for IO submssion.
519	* These will be caught by xlog_state_iclog_release() and call this function
520	* again to process any callbacks that may have been added to that iclog.
521	*/
522	static void
523	xlog_state_shutdown_callbacks(
524	struct xlog *log)
525	{
526	struct xlog_in_core *iclog;
527	LIST_HEAD(cb_list);
528
529	iclog = log->l_iclog;
530	do {
531	if (atomic_read(v: &iclog->ic_refcnt)) {
532	/ Reference holder will re-run iclog callbacks. /
533	continue;
534	}
535	list_splice_init(list: &iclog->ic_callbacks, head: &cb_list);
536	spin_unlock(lock: &log->l_icloglock);
537
538	xlog_cil_process_committed(list: &cb_list);
539
540	spin_lock(lock: &log->l_icloglock);
541	wake_up_all(&iclog->ic_write_wait);
542	wake_up_all(&iclog->ic_force_wait);
543	} while ((iclog = iclog->ic_next) != log->l_iclog);
544
545	wake_up_all(&log->l_flush_wait);
546	}
547
548	/*
549	* Flush iclog to disk if this is the last reference to the given iclog and the
550	* it is in the WANT_SYNC state.
551	*
552	* If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
553	* log tail is updated correctly. NEED_FUA indicates that the iclog will be
554	* written to stable storage, and implies that a commit record is contained
555	* within the iclog. We need to ensure that the log tail does not move beyond
556	* the tail that the first commit record in the iclog ordered against, otherwise
557	* correct recovery of that checkpoint becomes dependent on future operations
558	* performed on this iclog.
559	*
560	* Hence if NEED_FUA is set and the current iclog tail lsn is empty, write the
561	* current tail into iclog. Once the iclog tail is set, future operations must
562	* not modify it, otherwise they potentially violate ordering constraints for
563	* the checkpoint commit that wrote the initial tail lsn value. The tail lsn in
564	* the iclog will get zeroed on activation of the iclog after sync, so we
565	* always capture the tail lsn on the iclog on the first NEED_FUA release
566	* regardless of the number of active reference counts on this iclog.
567	*/
568	int
569	xlog_state_release_iclog(
570	struct xlog *log,
571	struct xlog_in_core *iclog,
572	struct xlog_ticket *ticket)
573	{
574	xfs_lsn_t tail_lsn;
575	bool last_ref;
576
577	lockdep_assert_held(&log->l_icloglock);
578
579	trace_xlog_iclog_release(iclog, _RET_IP_);
580	/*
581	* Grabbing the current log tail needs to be atomic w.r.t. the writing
582	* of the tail LSN into the iclog so we guarantee that the log tail does
583	* not move between the first time we know that the iclog needs to be
584	* made stable and when we eventually submit it.
585	*/
586	if ((iclog->ic_state == XLOG_STATE_WANT_SYNC \|\|
587	(iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
588	!iclog->ic_header.h_tail_lsn) {
589	tail_lsn = xlog_assign_tail_lsn(log->l_mp);
590	iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
591	}
592
593	last_ref = atomic_dec_and_test(v: &iclog->ic_refcnt);
594
595	if (xlog_is_shutdown(log)) {
596	/*
597	* If there are no more references to this iclog, process the
598	* pending iclog callbacks that were waiting on the release of
599	* this iclog.
600	*/
601	if (last_ref)
602	xlog_state_shutdown_callbacks(log);
603	return -EIO;
604	}
605
606	if (!last_ref)
607	return `0`;
608
609	if (iclog->ic_state != XLOG_STATE_WANT_SYNC) {
610	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
611	return `0`;
612	}
613
614	iclog->ic_state = XLOG_STATE_SYNCING;
615	xlog_verify_tail_lsn(log, iclog);
616	trace_xlog_iclog_syncing(iclog, _RET_IP_);
617
618	spin_unlock(lock: &log->l_icloglock);
619	xlog_sync(log, iclog, ticket);
620	spin_lock(lock: &log->l_icloglock);
621	return `0`;
622	}
623
624	/*
625	* Mount a log filesystem
626	*
627	* mp - ubiquitous xfs mount point structure
628	* log_target - buftarg of on-disk log device
629	* blk_offset - Start block # where block size is 512 bytes (BBSIZE)
630	* num_bblocks - Number of BBSIZE blocks in on-disk log
631	*
632	* Return error or zero.
633	*/
634	int
635	xfs_log_mount(
636	xfs_mount_t *mp,
637	xfs_buftarg_t *log_target,
638	xfs_daddr_t blk_offset,
639	int num_bblks)
640	{
641	struct xlog *log;
642	int error = `0`;
643	int min_logfsbs;
644
645	if (!xfs_has_norecovery(mp)) {
646	xfs_notice(mp, "Mounting V%d Filesystem %pU",
647	XFS_SB_VERSION_NUM(&mp->m_sb),
648	&mp->m_sb.sb_uuid);
649	} else {
650	xfs_notice(mp,
651	"Mounting V%d filesystem %pU in no-recovery mode. Filesystem will be inconsistent.",
652	XFS_SB_VERSION_NUM(&mp->m_sb),
653	&mp->m_sb.sb_uuid);
654	ASSERT(xfs_is_readonly(mp));
655	}
656
657	log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
658	if (IS_ERR(ptr: log)) {
659	error = PTR_ERR(ptr: log);
660	goto out;
661	}
662	mp->m_log = log;
663
664	/*
665	* Now that we have set up the log and it's internal geometry
666	* parameters, we can validate the given log space and drop a critical
667	* message via syslog if the log size is too small. A log that is too
668	* small can lead to unexpected situations in transaction log space
669	* reservation stage. The superblock verifier has already validated all
670	* the other log geometry constraints, so we don't have to check those
671	* here.
672	*
673	* Note: For v4 filesystems, we can't just reject the mount if the
674	* validation fails. This would mean that people would have to
675	* downgrade their kernel just to remedy the situation as there is no
676	* way to grow the log (short of black magic surgery with xfs_db).
677	*
678	* We can, however, reject mounts for V5 format filesystems, as the
679	* mkfs binary being used to make the filesystem should never create a
680	* filesystem with a log that is too small.
681	*/
682	min_logfsbs = xfs_log_calc_minimum_size(mp);
683	if (mp->m_sb.sb_logblocks < min_logfsbs) {
684	xfs_warn(mp,
685	"Log size %d blocks too small, minimum size is %d blocks",
686	mp->m_sb.sb_logblocks, min_logfsbs);
687
688	/*
689	* Log check errors are always fatal on v5; or whenever bad
690	* metadata leads to a crash.
691	*/
692	if (xfs_has_crc(mp)) {
693	xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
694	ASSERT(`0`);
695	error = -EINVAL;
696	goto out_free_log;
697	}
698	xfs_crit(mp, "Log size out of supported range.");
699	xfs_crit(mp,
700	"Continuing onwards, but if log hangs are experienced then please report this message in the bug report.");
701	}
702
703	/*
704	* Initialize the AIL now we have a log.
705	*/
706	error = xfs_trans_ail_init(mp);
707	if (error) {
708	xfs_warn(mp, "AIL initialisation failed: error %d", error);
709	goto out_free_log;
710	}
711	log->l_ailp = mp->m_ail;
712
713	/*
714	* skip log recovery on a norecovery mount. pretend it all
715	* just worked.
716	*/
717	if (!xfs_has_norecovery(mp)) {
718	error = xlog_recover(log);
719	if (error) {
720	xfs_warn(mp, "log mount/recovery failed: error %d",
721	error);
722	xlog_recover_cancel(log);
723	goto out_destroy_ail;
724	}
725	}
726
727	error = xfs_sysfs_init(kobj: &log->l_kobj, ktype: &xfs_log_ktype, parent_kobj: &mp->m_kobj,
728	name: "log");
729	if (error)
730	goto out_destroy_ail;
731
732	/ Normal transactions can now occur /
733	clear_bit(XLOG_ACTIVE_RECOVERY, addr: &log->l_opstate);
734
735	/*
736	* Now the log has been fully initialised and we know were our
737	* space grant counters are, we can initialise the permanent ticket
738	* needed for delayed logging to work.
739	*/
740	xlog_cil_init_post_recovery(log);
741
742	return `0`;
743
744	out_destroy_ail:
745	xfs_trans_ail_destroy(mp);
746	out_free_log:
747	xlog_dealloc_log(log);
748	out:
749	return error;
750	}
751
752	/*
753	* Finish the recovery of the file system. This is separate from the
754	* xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
755	* in the root and real-time bitmap inodes between calling xfs_log_mount() and
756	* here.
757	*
758	* If we finish recovery successfully, start the background log work. If we are
759	* not doing recovery, then we have a RO filesystem and we don't need to start
760	* it.
761	*/
762	int
763	xfs_log_mount_finish(
764	struct xfs_mount *mp)
765	{
766	struct xlog *log = mp->m_log;
767	int error = `0`;
768
769	if (xfs_has_norecovery(mp)) {
770	ASSERT(xfs_is_readonly(mp));
771	return `0`;
772	}
773
774	/*
775	* During the second phase of log recovery, we need iget and
776	* iput to behave like they do for an active filesystem.
777	* xfs_fs_drop_inode needs to be able to prevent the deletion
778	* of inodes before we're done replaying log items on those
779	* inodes. Turn it off immediately after recovery finishes
780	* so that we don't leak the quota inodes if subsequent mount
781	* activities fail.
782	*
783	* We let all inodes involved in redo item processing end up on
784	* the LRU instead of being evicted immediately so that if we do
785	* something to an unlinked inode, the irele won't cause
786	* premature truncation and freeing of the inode, which results
787	* in log recovery failure. We have to evict the unreferenced
788	* lru inodes after clearing SB_ACTIVE because we don't
789	* otherwise clean up the lru if there's a subsequent failure in
790	* xfs_mountfs, which leads to us leaking the inodes if nothing
791	* else (e.g. quotacheck) references the inodes before the
792	* mount failure occurs.
793	*/
794	mp->m_super->s_flags \|= SB_ACTIVE;
795	xfs_log_work_queue(mp);
796	if (xlog_recovery_needed(log))
797	error = xlog_recover_finish(log);
798	mp->m_super->s_flags &= ~SB_ACTIVE;
799	evict_inodes(sb: mp->m_super);
800
801	/*
802	* Drain the buffer LRU after log recovery. This is required for v4
803	* filesystems to avoid leaving around buffers with NULL verifier ops,
804	* but we do it unconditionally to make sure we're always in a clean
805	* cache state after mount.
806	*
807	* Don't push in the error case because the AIL may have pending intents
808	* that aren't removed until recovery is cancelled.
809	*/
810	if (xlog_recovery_needed(log)) {
811	if (!error) {
812	xfs_log_force(mp, XFS_LOG_SYNC);
813	xfs_ail_push_all_sync(mp->m_ail);
814	}
815	xfs_notice(mp, "Ending recovery (logdev: %s)",
816	mp->m_logname ? mp->m_logname : "internal");
817	} else {
818	xfs_info(mp, "Ending clean mount");
819	}
820	xfs_buftarg_drain(mp->m_ddev_targp);
821
822	clear_bit(XLOG_RECOVERY_NEEDED, addr: &log->l_opstate);
823
824	/ Make sure the log is dead if we're returning failure. /
825	ASSERT(!error \|\| xlog_is_shutdown(log));
826
827	return error;
828	}
829
830	/*
831	* The mount has failed. Cancel the recovery if it hasn't completed and destroy
832	* the log.
833	*/
834	void
835	xfs_log_mount_cancel(
836	struct xfs_mount *mp)
837	{
838	xlog_recover_cancel(mp->m_log);
839	xfs_log_unmount(mp);
840	}
841
842	/*
843	* Flush out the iclog to disk ensuring that device caches are flushed and
844	* the iclog hits stable storage before any completion waiters are woken.
845	*/
846	static inline int
847	xlog_force_iclog(
848	struct xlog_in_core *iclog)
849	{
850	atomic_inc(v: &iclog->ic_refcnt);
851	iclog->ic_flags \|= XLOG_ICL_NEED_FLUSH \| XLOG_ICL_NEED_FUA;
852	if (iclog->ic_state == XLOG_STATE_ACTIVE)
853	xlog_state_switch_iclogs(log: iclog->ic_log, iclog, eventual_size: `0`);
854	return xlog_state_release_iclog(log: iclog->ic_log, iclog, NULL);
855	}
856
857	/*
858	* Cycle all the iclogbuf locks to make sure all log IO completion
859	* is done before we tear down these buffers.
860	*/
861	static void
862	xlog_wait_iclog_completion(struct xlog *log)
863	{
864	int i;
865	struct xlog_in_core *iclog = log->l_iclog;
866
867	for (i = `0`; i < log->l_iclog_bufs; i++) {
868	down(sem: &iclog->ic_sema);
869	up(sem: &iclog->ic_sema);
870	iclog = iclog->ic_next;
871	}
872	}
873
874	/*
875	* Wait for the iclog and all prior iclogs to be written disk as required by the
876	* log force state machine. Waiting on ic_force_wait ensures iclog completions
877	* have been ordered and callbacks run before we are woken here, hence
878	* guaranteeing that all the iclogs up to this one are on stable storage.
879	*/
880	int
881	xlog_wait_on_iclog(
882	struct xlog_in_core *iclog)
883	__releases(iclog->ic_log->l_icloglock)
884	{
885	struct xlog *log = iclog->ic_log;
886
887	trace_xlog_iclog_wait_on(iclog, _RET_IP_);
888	if (!xlog_is_shutdown(log) &&
889	iclog->ic_state != XLOG_STATE_ACTIVE &&
890	iclog->ic_state != XLOG_STATE_DIRTY) {
891	XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
892	xlog_wait(wq: &iclog->ic_force_wait, lock: &log->l_icloglock);
893	} else {
894	spin_unlock(lock: &log->l_icloglock);
895	}
896
897	if (xlog_is_shutdown(log))
898	return -EIO;
899	return `0`;
900	}
901
902	/*
903	* Write out an unmount record using the ticket provided. We have to account for
904	* the data space used in the unmount ticket as this write is not done from a
905	* transaction context that has already done the accounting for us.
906	*/
907	static int
908	xlog_write_unmount_record(
909	struct xlog *log,
910	struct xlog_ticket *ticket)
911	{
912	struct {
913	struct xlog_op_header ophdr;
914	struct xfs_unmount_log_format ulf;
915	} unmount_rec = {
916	.ophdr = {
917	.oh_clientid = XFS_LOG,
918	.oh_tid = cpu_to_be32(ticket->t_tid),
919	.oh_flags = XLOG_UNMOUNT_TRANS,
920	},
921	.ulf = {
922	.magic = XLOG_UNMOUNT_TYPE,
923	},
924	};
925	struct xfs_log_iovec reg = {
926	.i_addr = &unmount_rec,
927	.i_len = sizeof(unmount_rec),
928	.i_type = XLOG_REG_TYPE_UNMOUNT,
929	};
930	struct xfs_log_vec vec = {
931	.lv_niovecs = `1`,
932	.lv_iovecp = &reg,
933	};
934	LIST_HEAD(lv_chain);
935	list_add(new: &vec.lv_list, head: &lv_chain);
936
937	BUILD_BUG_ON((sizeof(struct xlog_op_header) +
938	sizeof(struct xfs_unmount_log_format)) !=
939	sizeof(unmount_rec));
940
941	/ account for space used by record data /
942	ticket->t_curr_res -= sizeof(unmount_rec);
943
944	return xlog_write(log, NULL, lv_chain: &lv_chain, tic: ticket, len: reg.i_len);
945	}
946
947	/*
948	* Mark the filesystem clean by writing an unmount record to the head of the
949	* log.
950	*/
951	static void
952	xlog_unmount_write(
953	struct xlog *log)
954	{
955	struct xfs_mount *mp = log->l_mp;
956	struct xlog_in_core *iclog;
957	struct xlog_ticket *tic = NULL;
958	int error;
959
960	error = xfs_log_reserve(mp, unit_bytes: `600`, cnt: `1`, ticp: &tic, permanent: `0`);
961	if (error)
962	goto out_err;
963
964	error = xlog_write_unmount_record(log, ticket: tic);
965	/*
966	* At this point, we're umounting anyway, so there's no point in
967	* transitioning log state to shutdown. Just continue...
968	*/
969	out_err:
970	if (error)
971	xfs_alert(mp, "%s: unmount record failed", __func__);
972
973	spin_lock(lock: &log->l_icloglock);
974	iclog = log->l_iclog;
975	error = xlog_force_iclog(iclog);
976	xlog_wait_on_iclog(iclog);
977
978	if (tic) {
979	trace_xfs_log_umount_write(log, tic);
980	xfs_log_ticket_ungrant(log, ticket: tic);
981	}
982	}
983
984	static void
985	xfs_log_unmount_verify_iclog(
986	struct xlog *log)
987	{
988	struct xlog_in_core *iclog = log->l_iclog;
989
990	do {
991	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
992	ASSERT(iclog->ic_offset == `0`);
993	} while ((iclog = iclog->ic_next) != log->l_iclog);
994	}
995
996	/*
997	* Unmount record used to have a string "Unmount filesystem--" in the
998	* data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
999	* We just write the magic number now since that particular field isn't
1000	* currently architecture converted and "Unmount" is a bit foo.
1001	* As far as I know, there weren't any dependencies on the old behaviour.
1002	*/
1003	static void
1004	xfs_log_unmount_write(
1005	struct xfs_mount *mp)
1006	{
1007	struct xlog *log = mp->m_log;
1008
1009	if (!xfs_log_writable(mp))
1010	return;
1011
1012	xfs_log_force(mp, XFS_LOG_SYNC);
1013
1014	if (xlog_is_shutdown(log))
1015	return;
1016
1017	/*
1018	* If we think the summary counters are bad, avoid writing the unmount
1019	* record to force log recovery at next mount, after which the summary
1020	* counters will be recalculated. Refer to xlog_check_unmount_rec for
1021	* more details.
1022	*/
1023	if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
1024	XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
1025	xfs_alert(mp, "%s: will fix summary counters at next mount",
1026	__func__);
1027	return;
1028	}
1029
1030	xfs_log_unmount_verify_iclog(log);
1031	xlog_unmount_write(log);
1032	}
1033
1034	/*
1035	* Empty the log for unmount/freeze.
1036	*
1037	* To do this, we first need to shut down the background log work so it is not
1038	* trying to cover the log as we clean up. We then need to unpin all objects in
1039	* the log so we can then flush them out. Once they have completed their IO and
1040	* run the callbacks removing themselves from the AIL, we can cover the log.
1041	*/
1042	int
1043	xfs_log_quiesce(
1044	struct xfs_mount *mp)
1045	{
1046	/*
1047	* Clear log incompat features since we're quiescing the log. Report
1048	* failures, though it's not fatal to have a higher log feature
1049	* protection level than the log contents actually require.
1050	*/
1051	if (xfs_clear_incompat_log_features(mp)) {
1052	int error;
1053
1054	error = xfs_sync_sb(mp, false);
1055	if (error)
1056	xfs_warn(mp,
1057	"Failed to clear log incompat features on quiesce");
1058	}
1059
1060	cancel_delayed_work_sync(dwork: &mp->m_log->l_work);
1061	xfs_log_force(mp, XFS_LOG_SYNC);
1062
1063	/*
1064	* The superblock buffer is uncached and while xfs_ail_push_all_sync()
1065	* will push it, xfs_buftarg_wait() will not wait for it. Further,
1066	* xfs_buf_iowait() cannot be used because it was pushed with the
1067	* XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
1068	* the IO to complete.
1069	*/
1070	xfs_ail_push_all_sync(mp->m_ail);
1071	xfs_buftarg_wait(mp->m_ddev_targp);
1072	xfs_buf_lock(mp->m_sb_bp);
1073	xfs_buf_unlock(mp->m_sb_bp);
1074
1075	return xfs_log_cover(mp);
1076	}
1077
/*
 * Quiesce the log and then write the unmount record. The value returned by
 * xfs_log_quiesce() is not propagated to the caller.
 */
void
xfs_log_clean(
	struct xfs_mount	*mp)
{
	xfs_log_quiesce(mp);
	xfs_log_unmount_write(mp);
}
1085
1086	/*
1087	* Shut down and release the AIL and Log.
1088	*
1089	* During unmount, we need to ensure we flush all the dirty metadata objects
1090	* from the AIL so that the log is empty before we write the unmount record to
1091	* the log. Once this is done, we can tear down the AIL and the log.
1092	*/
1093	void
1094	xfs_log_unmount(
1095	struct xfs_mount *mp)
1096	{
1097	xfs_log_clean(mp);
1098
1099	/*
1100	* If shutdown has come from iclog IO context, the log
1101	* cleaning will have been skipped and so we need to wait
1102	* for the iclog to complete shutdown processing before we
1103	* tear anything down.
1104	*/
1105	xlog_wait_iclog_completion(log: mp->m_log);
1106
1107	xfs_buftarg_drain(mp->m_ddev_targp);
1108
1109	xfs_trans_ail_destroy(mp);
1110
1111	xfs_sysfs_del(kobj: &mp->m_log->l_kobj);
1112
1113	xlog_dealloc_log(log: mp->m_log);
1114	}
1115
1116	void
1117	xfs_log_item_init(
1118	struct xfs_mount *mp,
1119	struct xfs_log_item *item,
1120	int type,
1121	const struct xfs_item_ops *ops)
1122	{
1123	item->li_log = mp->m_log;
1124	item->li_ailp = mp->m_ail;
1125	item->li_type = type;
1126	item->li_ops = ops;
1127	item->li_lv = NULL;
1128
1129	INIT_LIST_HEAD(list: &item->li_ail);
1130	INIT_LIST_HEAD(list: &item->li_cil);
1131	INIT_LIST_HEAD(list: &item->li_bio_list);
1132	INIT_LIST_HEAD(list: &item->li_trans);
1133	}
1134
1135	/*
1136	* Wake up processes waiting for log space after we have moved the log tail.
1137	*/
1138	void
1139	xfs_log_space_wake(
1140	struct xfs_mount *mp)
1141	{
1142	struct xlog *log = mp->m_log;
1143	int free_bytes;
1144
1145	if (xlog_is_shutdown(log))
1146	return;
1147
1148	if (!list_empty_careful(head: &log->l_write_head.waiters)) {
1149	ASSERT(!xlog_in_recovery(log));
1150
1151	spin_lock(lock: &log->l_write_head.lock);
1152	free_bytes = xlog_space_left(log, head: &log->l_write_head.grant);
1153	xlog_grant_head_wake(log, head: &log->l_write_head, free_bytes: &free_bytes);
1154	spin_unlock(lock: &log->l_write_head.lock);
1155	}
1156
1157	if (!list_empty_careful(head: &log->l_reserve_head.waiters)) {
1158	ASSERT(!xlog_in_recovery(log));
1159
1160	spin_lock(lock: &log->l_reserve_head.lock);
1161	free_bytes = xlog_space_left(log, head: &log->l_reserve_head.grant);
1162	xlog_grant_head_wake(log, head: &log->l_reserve_head, free_bytes: &free_bytes);
1163	spin_unlock(lock: &log->l_reserve_head.lock);
1164	}
1165	}
1166
1167	/*
1168	* Determine if we have a transaction that has gone to disk that needs to be
1169	* covered. To begin the transition to the idle state firstly the log needs to
1170	* be idle. That means the CIL, the AIL and the iclogs needs to be empty before
1171	* we start attempting to cover the log.
1172	*
1173	* Only if we are then in a state where covering is needed, the caller is
1174	* informed that dummy transactions are required to move the log into the idle
1175	* state.
1176	*
1177	* If there are any items in the AIl or CIL, then we do not want to attempt to
1178	* cover the log as we may be in a situation where there isn't log space
1179	* available to run a dummy transaction and this can lead to deadlocks when the
1180	* tail of the log is pinned by an item that is modified in the CIL. Hence
1181	* there's no point in running a dummy transaction at this point because we
1182	* can't start trying to idle the log until both the CIL and AIL are empty.
1183	*/
1184	static bool
1185	xfs_log_need_covered(
1186	struct xfs_mount *mp)
1187	{
1188	struct xlog *log = mp->m_log;
1189	bool needed = false;
1190
1191	if (!xlog_cil_empty(log))
1192	return false;
1193
1194	spin_lock(lock: &log->l_icloglock);
1195	switch (log->l_covered_state) {
1196	case XLOG_STATE_COVER_DONE:
1197	case XLOG_STATE_COVER_DONE2:
1198	case XLOG_STATE_COVER_IDLE:
1199	break;
1200	case XLOG_STATE_COVER_NEED:
1201	case XLOG_STATE_COVER_NEED2:
1202	if (xfs_ail_min_lsn(log->l_ailp))
1203	break;
1204	if (!xlog_iclogs_empty(log))
1205	break;
1206
1207	needed = true;
1208	if (log->l_covered_state == XLOG_STATE_COVER_NEED)
1209	log->l_covered_state = XLOG_STATE_COVER_DONE;
1210	else
1211	log->l_covered_state = XLOG_STATE_COVER_DONE2;
1212	break;
1213	default:
1214	needed = true;
1215	break;
1216	}
1217	spin_unlock(lock: &log->l_icloglock);
1218	return needed;
1219	}
1220
1221	/*
1222	* Explicitly cover the log. This is similar to background log covering but
1223	* intended for usage in quiesce codepaths. The caller is responsible to ensure
1224	* the log is idle and suitable for covering. The CIL, iclog buffers and AIL
1225	* must all be empty.
1226	*/
1227	static int
1228	xfs_log_cover(
1229	struct xfs_mount *mp)
1230	{
1231	int error = `0`;
1232	bool need_covered;
1233
1234	ASSERT((xlog_cil_empty(mp->m_log) && xlog_iclogs_empty(mp->m_log) &&
1235	!xfs_ail_min_lsn(mp->m_log->l_ailp)) \|\|
1236	xlog_is_shutdown(mp->m_log));
1237
1238	if (!xfs_log_writable(mp))
1239	return `0`;
1240
1241	/*
1242	* xfs_log_need_covered() is not idempotent because it progresses the
1243	* state machine if the log requires covering. Therefore, we must call
1244	* this function once and use the result until we've issued an sb sync.
1245	* Do so first to make that abundantly clear.
1246	*
1247	* Fall into the covering sequence if the log needs covering or the
1248	* mount has lazy superblock accounting to sync to disk. The sb sync
1249	* used for covering accumulates the in-core counters, so covering
1250	* handles this for us.
1251	*/
1252	need_covered = xfs_log_need_covered(mp);
1253	if (!need_covered && !xfs_has_lazysbcount(mp))
1254	return `0`;
1255
1256	/*
1257	* To cover the log, commit the superblock twice (at most) in
1258	* independent checkpoints. The first serves as a reference for the
1259	* tail pointer. The sync transaction and AIL push empties the AIL and
1260	* updates the in-core tail to the LSN of the first checkpoint. The
1261	* second commit updates the on-disk tail with the in-core LSN,
1262	* covering the log. Push the AIL one more time to leave it empty, as
1263	* we found it.
1264	*/
1265	do {
1266	error = xfs_sync_sb(mp, true);
1267	if (error)
1268	break;
1269	xfs_ail_push_all_sync(mp->m_ail);
1270	} while (xfs_log_need_covered(mp));
1271
1272	return error;
1273	}
1274
1275	/*
1276	* We may be holding the log iclog lock upon entering this routine.
1277	*/
1278	xfs_lsn_t
1279	xlog_assign_tail_lsn_locked(
1280	struct xfs_mount *mp)
1281	{
1282	struct xlog *log = mp->m_log;
1283	struct xfs_log_item *lip;
1284	xfs_lsn_t tail_lsn;
1285
1286	assert_spin_locked(&mp->m_ail->ail_lock);
1287
1288	/*
1289	* To make sure we always have a valid LSN for the log tail we keep
1290	* track of the last LSN which was committed in log->l_last_sync_lsn,
1291	* and use that when the AIL was empty.
1292	*/
1293	lip = xfs_ail_min(ailp: mp->m_ail);
1294	if (lip)
1295	tail_lsn = lip->li_lsn;
1296	else
1297	tail_lsn = atomic64_read(&log->l_last_sync_lsn);
1298	trace_xfs_log_assign_tail_lsn(log, tail_lsn);
1299	atomic64_set(&log->l_tail_lsn, tail_lsn);
1300	return tail_lsn;
1301	}
1302
1303	xfs_lsn_t
1304	xlog_assign_tail_lsn(
1305	struct xfs_mount *mp)
1306	{
1307	xfs_lsn_t tail_lsn;
1308
1309	spin_lock(lock: &mp->m_ail->ail_lock);
1310	tail_lsn = xlog_assign_tail_lsn_locked(mp);
1311	spin_unlock(lock: &mp->m_ail->ail_lock);
1312
1313	return tail_lsn;
1314	}
1315
1316	/*
1317	* Return the space in the log between the tail and the head. The head
1318	* is passed in the cycle/bytes formal parms. In the special case where
1319	* the reserve head has wrapped passed the tail, this calculation is no
1320	* longer valid. In this case, just return 0 which means there is no space
1321	* in the log. This works for all places where this function is called
1322	* with the reserve head. Of course, if the write head were to ever
1323	* wrap the tail, we should blow up. Rather than catch this case here,
1324	* we depend on other ASSERTions in other parts of the code. XXXmiken
1325	*
1326	* If reservation head is behind the tail, we have a problem. Warn about it,
1327	* but then treat it as if the log is empty.
1328	*
1329	* If the log is shut down, the head and tail may be invalid or out of whack, so
1330	* shortcut invalidity asserts in this case so that we don't trigger them
1331	* falsely.
1332	*/
1333	STATIC int
1334	xlog_space_left(
1335	struct xlog *log,
1336	atomic64_t *head)
1337	{
1338	int tail_bytes;
1339	int tail_cycle;
1340	int head_cycle;
1341	int head_bytes;
1342
1343	xlog_crack_grant_head(head, cycle: &head_cycle, space: &head_bytes);
1344	xlog_crack_atomic_lsn(lsn: &log->l_tail_lsn, cycle: &tail_cycle, block: &tail_bytes);
1345	tail_bytes = BBTOB(tail_bytes);
1346	if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
1347	return log->l_logsize - (head_bytes - tail_bytes);
1348	if (tail_cycle + `1` < head_cycle)
1349	return `0`;
1350
1351	/ Ignore potential inconsistency when shutdown. /
1352	if (xlog_is_shutdown(log))
1353	return log->l_logsize;
1354
1355	if (tail_cycle < head_cycle) {
1356	ASSERT(tail_cycle == (head_cycle - `1`));
1357	return tail_bytes - head_bytes;
1358	}
1359
1360	/*
1361	* The reservation head is behind the tail. In this case we just want to
1362	* return the size of the log as the amount of space left.
1363	*/
1364	xfs_alert(log->l_mp, "xlog_space_left: head behind tail");
1365	xfs_alert(log->l_mp, " tail_cycle = %d, tail_bytes = %d",
1366	tail_cycle, tail_bytes);
1367	xfs_alert(log->l_mp, " GH cycle = %d, GH bytes = %d",
1368	head_cycle, head_bytes);
1369	ASSERT(`0`);
1370	return log->l_logsize;
1371	}
1372
1373
1374	static void
1375	xlog_ioend_work(
1376	struct work_struct *work)
1377	{
1378	struct xlog_in_core *iclog =
1379	container_of(work, struct xlog_in_core, ic_end_io_work);
1380	struct xlog *log = iclog->ic_log;
1381	int error;
1382
1383	error = blk_status_to_errno(status: iclog->ic_bio.bi_status);
1384	#ifdef DEBUG
1385	/ treat writes with injected CRC errors as failed /
1386	if (iclog->ic_fail_crc)
1387	error = -EIO;
1388	#endif
1389
1390	/*
1391	* Race to shutdown the filesystem if we see an error.
1392	*/
1393	if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
1394	xfs_alert(log->l_mp, "log I/O error %d", error);
1395	xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
1396	}
1397
1398	xlog_state_done_syncing(iclog);
1399	bio_uninit(&iclog->ic_bio);
1400
1401	/*
1402	* Drop the lock to signal that we are done. Nothing references the
1403	* iclog after this, so an unmount waiting on this lock can now tear it
1404	* down safely. As such, it is unsafe to reference the iclog after the
1405	* unlock as we could race with it being freed.
1406	*/
1407	up(sem: &iclog->ic_sema);
1408	}
1409
1410	/*
1411	* Return size of each in-core log record buffer.
1412	*
1413	* All machines get 8 x 32kB buffers by default, unless tuned otherwise.
1414	*
1415	* If the filesystem blocksize is too large, we may need to choose a
1416	* larger size since the directory code currently logs entire blocks.
1417	*/
1418	STATIC void
1419	xlog_get_iclog_buffer_size(
1420	struct xfs_mount *mp,
1421	struct xlog *log)
1422	{
1423	if (mp->m_logbufs <= `0`)
1424	mp->m_logbufs = XLOG_MAX_ICLOGS;
1425	if (mp->m_logbsize <= `0`)
1426	mp->m_logbsize = XLOG_BIG_RECORD_BSIZE;
1427
1428	log->l_iclog_bufs = mp->m_logbufs;
1429	log->l_iclog_size = mp->m_logbsize;
1430
1431	/*
1432	* # headers = size / 32k - one header holds cycles from 32k of data.
1433	*/
1434	log->l_iclog_heads =
1435	DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE);
1436	log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT;
1437	}
1438
1439	void
1440	xfs_log_work_queue(
1441	struct xfs_mount *mp)
1442	{
1443	queue_delayed_work(wq: mp->m_sync_workqueue, dwork: &mp->m_log->l_work,
1444	delay: msecs_to_jiffies(xfs_syncd_centisecs * `10`));
1445	}
1446
1447	/*
1448	* Clear the log incompat flags if we have the opportunity.
1449	*
1450	* This only happens if we're about to log the second dummy transaction as part
1451	* of covering the log and we can get the log incompat feature usage lock.
1452	*/
1453	static inline void
1454	xlog_clear_incompat(
1455	struct xlog *log)
1456	{
1457	struct xfs_mount *mp = log->l_mp;
1458
1459	if (!xfs_sb_has_incompat_log_feature(&mp->m_sb,
1460	XFS_SB_FEAT_INCOMPAT_LOG_ALL))
1461	return;
1462
1463	if (log->l_covered_state != XLOG_STATE_COVER_DONE2)
1464	return;
1465
1466	if (!down_write_trylock(sem: &log->l_incompat_users))
1467	return;
1468
1469	xfs_clear_incompat_log_features(mp);
1470	up_write(sem: &log->l_incompat_users);
1471	}
1472
1473	/*
1474	* Every sync period we need to unpin all items in the AIL and push them to
1475	* disk. If there is nothing dirty, then we might need to cover the log to
1476	* indicate that the filesystem is idle.
1477	*/
1478	static void
1479	xfs_log_worker(
1480	struct work_struct *work)
1481	{
1482	struct xlog *log = container_of(to_delayed_work(work),
1483	struct xlog, l_work);
1484	struct xfs_mount *mp = log->l_mp;
1485
1486	/ dgc: errors ignored - not fatal and nowhere to report them /
1487	if (xfs_fs_writable(mp, level: SB_FREEZE_WRITE) && xfs_log_need_covered(mp)) {
1488	/*
1489	* Dump a transaction into the log that contains no real change.
1490	* This is needed to stamp the current tail LSN into the log
1491	* during the covering operation.
1492	*
1493	* We cannot use an inode here for this - that will push dirty
1494	* state back up into the VFS and then periodic inode flushing
1495	* will prevent log covering from making progress. Hence we
1496	* synchronously log the superblock instead to ensure the
1497	* superblock is immediately unpinned and can be written back.
1498	*/
1499	xlog_clear_incompat(log);
1500	xfs_sync_sb(mp, true);
1501	} else
1502	xfs_log_force(mp, flags: `0`);
1503
1504	/ start pushing all the metadata that is currently dirty /
1505	xfs_ail_push_all(mp->m_ail);
1506
1507	/ queue us up again /
1508	xfs_log_work_queue(mp);
1509	}
1510
1511	/*
1512	* This routine initializes some of the log structure for a given mount point.
1513	* Its primary purpose is to fill in enough, so recovery can occur. However,
1514	* some other stuff may be filled in too.
1515	*/
1516	STATIC struct xlog *
1517	xlog_alloc_log(
1518	struct xfs_mount *mp,
1519	struct xfs_buftarg *log_target,
1520	xfs_daddr_t blk_offset,
1521	int num_bblks)
1522	{
1523	struct xlog *log;
1524	xlog_rec_header_t *head;
1525	xlog_in_core_t **iclogp;
1526	xlog_in_core_t iclog, prev_iclog=NULL;
1527	int i;
1528	int error = -ENOMEM;
1529	uint log2_size = `0`;
1530
1531	log = kmem_zalloc(size: sizeof(struct xlog), KM_MAYFAIL);
1532	if (!log) {
1533	xfs_warn(mp, "Log allocation failed: No memory!");
1534	goto out;
1535	}
1536
1537	log->l_mp = mp;
1538	log->l_targ = log_target;
1539	log->l_logsize = BBTOB(num_bblks);
1540	log->l_logBBstart = blk_offset;
1541	log->l_logBBsize = num_bblks;
1542	log->l_covered_state = XLOG_STATE_COVER_IDLE;
1543	set_bit(XLOG_ACTIVE_RECOVERY, addr: &log->l_opstate);
1544	INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
1545
1546	log->l_prev_block = -`1`;
1547	/ log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 /
1548	xlog_assign_atomic_lsn(lsn: &log->l_tail_lsn, cycle: `1`, block: `0`);
1549	xlog_assign_atomic_lsn(lsn: &log->l_last_sync_lsn, cycle: `1`, block: `0`);
1550	log->l_curr_cycle = `1`; / 0 is bad since this is initial value /
1551
1552	if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > `1`)
1553	log->l_iclog_roundoff = mp->m_sb.sb_logsunit;
1554	else
1555	log->l_iclog_roundoff = BBSIZE;
1556
1557	xlog_grant_head_init(head: &log->l_reserve_head);
1558	xlog_grant_head_init(head: &log->l_write_head);
1559
1560	error = -EFSCORRUPTED;
1561	if (xfs_has_sector(mp)) {
1562	log2_size = mp->m_sb.sb_logsectlog;
1563	if (log2_size < BBSHIFT) {
1564	xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
1565	log2_size, BBSHIFT);
1566	goto out_free_log;
1567	}
1568
1569	log2_size -= BBSHIFT;
1570	if (log2_size > mp->m_sectbb_log) {
1571	xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
1572	log2_size, mp->m_sectbb_log);
1573	goto out_free_log;
1574	}
1575
1576	/ for larger sector sizes, must have v2 or external log /
1577	if (log2_size && log->l_logBBstart > `0` &&
1578	!xfs_has_logv2(mp)) {
1579	xfs_warn(mp,
1580	"log sector size (0x%x) invalid for configuration.",
1581	log2_size);
1582	goto out_free_log;
1583	}
1584	}
1585	log->l_sectBBsize = `1` << log2_size;
1586
1587	init_rwsem(&log->l_incompat_users);
1588
1589	xlog_get_iclog_buffer_size(mp, log);
1590
1591	spin_lock_init(&log->l_icloglock);
1592	init_waitqueue_head(&log->l_flush_wait);
1593
1594	iclogp = &log->l_iclog;
1595	/*
1596	* The amount of memory to allocate for the iclog structure is
1597	* rather funky due to the way the structure is defined. It is
1598	* done this way so that we can use different sizes for machines
1599	* with different amounts of memory. See the definition of
1600	* xlog_in_core_t in xfs_log_priv.h for details.
1601	*/
1602	ASSERT(log->l_iclog_size >= `4096`);
1603	for (i = `0`; i < log->l_iclog_bufs; i++) {
1604	size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
1605	sizeof(struct bio_vec);
1606
1607	iclog = kmem_zalloc(size: sizeof(*iclog) + bvec_size, KM_MAYFAIL);
1608	if (!iclog)
1609	goto out_free_iclog;
1610
1611	*iclogp = iclog;
1612	iclog->ic_prev = prev_iclog;
1613	prev_iclog = iclog;
1614
1615	iclog->ic_data = kvzalloc(size: log->l_iclog_size,
1616	GFP_KERNEL \| __GFP_RETRY_MAYFAIL);
1617	if (!iclog->ic_data)
1618	goto out_free_iclog;
1619	head = &iclog->ic_header;
1620	memset(head, `0`, sizeof(xlog_rec_header_t));
1621	head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1622	head->h_version = cpu_to_be32(
1623	xfs_has_logv2(log->l_mp) ? `2` : `1`);
1624	head->h_size = cpu_to_be32(log->l_iclog_size);
1625	/ new fields /
1626	head->h_fmt = cpu_to_be32(XLOG_FMT);
1627	memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
1628
1629	iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize;
1630	iclog->ic_state = XLOG_STATE_ACTIVE;
1631	iclog->ic_log = log;
1632	atomic_set(v: &iclog->ic_refcnt, i: `0`);
1633	INIT_LIST_HEAD(list: &iclog->ic_callbacks);
1634	iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize;
1635
1636	init_waitqueue_head(&iclog->ic_force_wait);
1637	init_waitqueue_head(&iclog->ic_write_wait);
1638	INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work);
1639	sema_init(sem: &iclog->ic_sema, val: `1`);
1640
1641	iclogp = &iclog->ic_next;
1642	}
1643	iclogp = log->l_iclog; /* complete ring /
1644	log->l_iclog->ic_prev = prev_iclog; / re-write 1st prev ptr /
1645
1646	log->l_ioend_workqueue = alloc_workqueue(fmt: "xfs-log/%s",
1647	XFS_WQFLAGS(WQ_FREEZABLE \| WQ_MEM_RECLAIM \|
1648	WQ_HIGHPRI),
1649	max_active: `0`, mp->m_super->s_id);
1650	if (!log->l_ioend_workqueue)
1651	goto out_free_iclog;
1652
1653	error = xlog_cil_init(log);
1654	if (error)
1655	goto out_destroy_workqueue;
1656	return log;
1657
1658	out_destroy_workqueue:
1659	destroy_workqueue(wq: log->l_ioend_workqueue);
1660	out_free_iclog:
1661	for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1662	prev_iclog = iclog->ic_next;
1663	kmem_free(ptr: iclog->ic_data);
1664	kmem_free(ptr: iclog);
1665	if (prev_iclog == log->l_iclog)
1666	break;
1667	}
1668	out_free_log:
1669	kmem_free(ptr: log);
1670	out:
1671	return ERR_PTR(error);
1672	} / xlog_alloc_log /
1673
1674	/*
1675	* Compute the LSN that we'd need to push the log tail towards in order to have
1676	* (a) enough on-disk log space to log the number of bytes specified, (b) at
1677	* least 25% of the log space free, and (c) at least 256 blocks free. If the
1678	* log free space already meets all three thresholds, this function returns
1679	* NULLCOMMITLSN.
1680	*/
1681	xfs_lsn_t
1682	xlog_grant_push_threshold(
1683	struct xlog *log,
1684	int need_bytes)
1685	{
1686	xfs_lsn_t threshold_lsn = `0`;
1687	xfs_lsn_t last_sync_lsn;
1688	int free_blocks;
1689	int free_bytes;
1690	int threshold_block;
1691	int threshold_cycle;
1692	int free_threshold;
1693
1694	ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1695
1696	free_bytes = xlog_space_left(log, head: &log->l_reserve_head.grant);
1697	free_blocks = BTOBBT(free_bytes);
1698
1699	/*
1700	* Set the threshold for the minimum number of free blocks in the
1701	* log to the maximum of what the caller needs, one quarter of the
1702	* log, and 256 blocks.
1703	*/
1704	free_threshold = BTOBB(need_bytes);
1705	free_threshold = max(free_threshold, (log->l_logBBsize >> `2`));
1706	free_threshold = max(free_threshold, `256`);
1707	if (free_blocks >= free_threshold)
1708	return NULLCOMMITLSN;
1709
1710	xlog_crack_atomic_lsn(lsn: &log->l_tail_lsn, cycle: &threshold_cycle,
1711	block: &threshold_block);
1712	threshold_block += free_threshold;
1713	if (threshold_block >= log->l_logBBsize) {
1714	threshold_block -= log->l_logBBsize;
1715	threshold_cycle += `1`;
1716	}
1717	threshold_lsn = xlog_assign_lsn(threshold_cycle,
1718	threshold_block);
1719	/*
1720	* Don't pass in an lsn greater than the lsn of the last
1721	* log record known to be on disk. Use a snapshot of the last sync lsn
1722	* so that it doesn't change between the compare and the set.
1723	*/
1724	last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
1725	if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > `0`)
1726	threshold_lsn = last_sync_lsn;
1727
1728	return threshold_lsn;
1729	}
1730
1731	/*
1732	* Push the tail of the log if we need to do so to maintain the free log space
1733	* thresholds set out by xlog_grant_push_threshold. We may need to adopt a
1734	* policy which pushes on an lsn which is further along in the log once we
1735	* reach the high water mark. In this manner, we would be creating a low water
1736	* mark.
1737	*/
1738	STATIC void
1739	xlog_grant_push_ail(
1740	struct xlog *log,
1741	int need_bytes)
1742	{
1743	xfs_lsn_t threshold_lsn;
1744
1745	threshold_lsn = xlog_grant_push_threshold(log, need_bytes);
1746	if (threshold_lsn == NULLCOMMITLSN \|\| xlog_is_shutdown(log))
1747	return;
1748
1749	/*
1750	* Get the transaction layer to kick the dirty buffers out to
1751	* disk asynchronously. No point in trying to do this if
1752	* the filesystem is shutting down.
1753	*/
1754	xfs_ail_push(log->l_ailp, threshold_lsn);
1755	}
1756
1757	/*
1758	* Stamp cycle number in every block
1759	*/
1760	STATIC void
1761	xlog_pack_data(
1762	struct xlog *log,
1763	struct xlog_in_core *iclog,
1764	int roundoff)
1765	{
1766	int i, j, k;
1767	int size = iclog->ic_offset + roundoff;
1768	__be32 cycle_lsn;
1769	char *dp;
1770
1771	cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
1772
1773	dp = iclog->ic_datap;
1774	for (i = `0`; i < BTOBB(size); i++) {
1775	if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
1776	break;
1777	iclog->ic_header.h_cycle_data[i] = (__be32 )dp;
1778	(__be32 )dp = cycle_lsn;
1779	dp += BBSIZE;
1780	}
1781
1782	if (xfs_has_logv2(mp: log->l_mp)) {
1783	xlog_in_core_2_t *xhdr = iclog->ic_data;
1784
1785	for ( ; i < BTOBB(size); i++) {
1786	j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1787	k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1788	xhdr[j].hic_xheader.xh_cycle_data[k] = (__be32 )dp;
1789	(__be32 )dp = cycle_lsn;
1790	dp += BBSIZE;
1791	}
1792
1793	for (i = `1`; i < log->l_iclog_heads; i++)
1794	xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
1795	}
1796	}
1797
1798	/*
1799	* Calculate the checksum for a log buffer.
1800	*
1801	* This is a little more complicated than it should be because the various
1802	* headers and the actual data are non-contiguous.
1803	*/
1804	__le32
1805	xlog_cksum(
1806	struct xlog *log,
1807	struct xlog_rec_header *rhead,
1808	char *dp,
1809	int size)
1810	{
1811	uint32_t crc;
1812
1813	/ first generate the crc for the record header ... /
1814	crc = xfs_start_cksum_update((char *)rhead,
1815	sizeof(struct xlog_rec_header),
1816	offsetof(struct xlog_rec_header, h_crc));
1817
1818	/ ... then for additional cycle data for v2 logs ... /
1819	if (xfs_has_logv2(mp: log->l_mp)) {
1820	union xlog_in_core2 xhdr = (union* xlog_in_core2 *)rhead;
1821	int i;
1822	int xheads;
1823
1824	xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE);
1825
1826	for (i = `1`; i < xheads; i++) {
1827	crc = crc32c(crc, &xhdr[i].hic_xheader,
1828	sizeof(struct xlog_rec_ext_header));
1829	}
1830	}
1831
1832	/ ... and finally for the payload /
1833	crc = crc32c(crc, address: dp, length: size);
1834
1835	return xfs_end_cksum(crc);
1836	}
1837
1838	static void
1839	xlog_bio_end_io(
1840	struct bio *bio)
1841	{
1842	struct xlog_in_core *iclog = bio->bi_private;
1843
1844	queue_work(wq: iclog->ic_log->l_ioend_workqueue,
1845	work: &iclog->ic_end_io_work);
1846	}
1847
1848	static int
1849	xlog_map_iclog_data(
1850	struct bio *bio,
1851	void *data,
1852	size_t count)
1853	{
1854	do {
1855	struct page *page = kmem_to_page(addr: data);
1856	unsigned int off = offset_in_page(data);
1857	size_t len = min_t(size_t, count, PAGE_SIZE - off);
1858
1859	if (bio_add_page(bio, page, len, off) != len)
1860	return -EIO;
1861
1862	data += len;
1863	count -= len;
1864	} while (count);
1865
1866	return `0`;
1867	}
1868
1869	STATIC void
1870	xlog_write_iclog(
1871	struct xlog *log,
1872	struct xlog_in_core *iclog,
1873	uint64_t bno,
1874	unsigned int count)
1875	{
1876	ASSERT(bno < log->l_logBBsize);
1877	trace_xlog_iclog_write(iclog, _RET_IP_);
1878
1879	/*
1880	* We lock the iclogbufs here so that we can serialise against I/O
1881	* completion during unmount. We might be processing a shutdown
1882	* triggered during unmount, and that can occur asynchronously to the
1883	* unmount thread, and hence we need to ensure that completes before
1884	* tearing down the iclogbufs. Hence we need to hold the buffer lock
1885	* across the log IO to archieve that.
1886	*/
1887	down(sem: &iclog->ic_sema);
1888	if (xlog_is_shutdown(log)) {
1889	/*
1890	* It would seem logical to return EIO here, but we rely on
1891	* the log state machine to propagate I/O errors instead of
1892	* doing it here. We kick of the state machine and unlock
1893	* the buffer manually, the code needs to be kept in sync
1894	* with the I/O completion path.
1895	*/
1896	xlog_state_done_syncing(iclog);
1897	up(sem: &iclog->ic_sema);
1898	return;
1899	}
1900
1901	/*
1902	* We use REQ_SYNC \| REQ_IDLE here to tell the block layer the are more
1903	* IOs coming immediately after this one. This prevents the block layer
1904	* writeback throttle from throttling log writes behind background
1905	* metadata writeback and causing priority inversions.
1906	*/
1907	bio_init(bio: &iclog->ic_bio, bdev: log->l_targ->bt_bdev, table: iclog->ic_bvec,
1908	howmany(count, PAGE_SIZE),
1909	opf: REQ_OP_WRITE \| REQ_META \| REQ_SYNC \| REQ_IDLE);
1910	iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
1911	iclog->ic_bio.bi_end_io = xlog_bio_end_io;
1912	iclog->ic_bio.bi_private = iclog;
1913
1914	if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
1915	iclog->ic_bio.bi_opf \|= REQ_PREFLUSH;
1916	/*
1917	* For external log devices, we also need to flush the data
1918	* device cache first to ensure all metadata writeback covered
1919	* by the LSN in this iclog is on stable storage. This is slow,
1920	* but it must complete before we issue the external log IO.
1921	*
1922	* If the flush fails, we cannot conclude that past metadata
1923	* writeback from the log succeeded. Repeating the flush is
1924	* not possible, hence we must shut down with log IO error to
1925	* avoid shutdown re-entering this path and erroring out again.
1926	*/
1927	if (log->l_targ != log->l_mp->m_ddev_targp &&
1928	blkdev_issue_flush(bdev: log->l_mp->m_ddev_targp->bt_bdev)) {
1929	xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
1930	return;
1931	}
1932	}
1933	if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
1934	iclog->ic_bio.bi_opf \|= REQ_FUA;
1935
1936	iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH \| XLOG_ICL_NEED_FUA);
1937
1938	if (xlog_map_iclog_data(bio: &iclog->ic_bio, data: iclog->ic_data, count)) {
1939	xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
1940	return;
1941	}
1942	if (is_vmalloc_addr(x: iclog->ic_data))
1943	flush_kernel_vmap_range(vaddr: iclog->ic_data, size: count);
1944
1945	/*
1946	* If this log buffer would straddle the end of the log we will have
1947	* to split it up into two bios, so that we can continue at the start.
1948	*/
1949	if (bno + BTOBB(count) > log->l_logBBsize) {
1950	struct bio *split;
1951
1952	split = bio_split(bio: &iclog->ic_bio, sectors: log->l_logBBsize - bno,
1953	GFP_NOIO, bs: &fs_bio_set);
1954	bio_chain(split, &iclog->ic_bio);
1955	submit_bio(bio: split);
1956
1957	/ restart at logical offset zero for the remainder /
1958	iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart;
1959	}
1960
1961	submit_bio(bio: &iclog->ic_bio);
1962	}
1963
1964	/*
1965	* We need to bump cycle number for the part of the iclog that is
1966	* written to the start of the log. Watch out for the header magic
1967	* number case, though.
1968	*/
1969	static void
1970	xlog_split_iclog(
1971	struct xlog *log,
1972	void *data,
1973	uint64_t bno,
1974	unsigned int count)
1975	{
1976	unsigned int split_offset = BBTOB(log->l_logBBsize - bno);
1977	unsigned int i;
1978
1979	for (i = split_offset; i < count; i += BBSIZE) {
1980	uint32_t cycle = get_unaligned_be32(p: data + i);
1981
1982	if (++cycle == XLOG_HEADER_MAGIC_NUM)
1983	cycle++;
1984	put_unaligned_be32(val: cycle, p: data + i);
1985	}
1986	}
1987
1988	static int
1989	xlog_calc_iclog_size(
1990	struct xlog *log,
1991	struct xlog_in_core *iclog,
1992	uint32_t *roundoff)
1993	{
1994	uint32_t count_init, count;
1995
1996	/ Add for LR header /
1997	count_init = log->l_iclog_hsize + iclog->ic_offset;
1998	count = roundup(count_init, log->l_iclog_roundoff);
1999
2000	*roundoff = count - count_init;
2001
2002	ASSERT(count >= count_init);
2003	ASSERT(*roundoff < log->l_iclog_roundoff);
2004	return count;
2005	}
2006
2007	/*
2008	* Flush out the in-core log (iclog) to the on-disk log in an asynchronous
2009	* fashion. Previously, we should have moved the current iclog
2010	* ptr in the log to point to the next available iclog. This allows further
2011	* write to continue while this code syncs out an iclog ready to go.
2012	* Before an in-core log can be written out, the data section must be scanned
2013	* to save away the 1st word of each BBSIZE block into the header. We replace
2014	* it with the current cycle count. Each BBSIZE block is tagged with the
2015	* cycle count because there in an implicit assumption that drives will
2016	* guarantee that entire 512 byte blocks get written at once. In other words,
2017	* we can't have part of a 512 byte block written and part not written. By
2018	* tagging each block, we will know which blocks are valid when recovering
2019	* after an unclean shutdown.
2020	*
2021	* This routine is single threaded on the iclog. No other thread can be in
2022	* this routine with the same iclog. Changing contents of iclog can there-
2023	* fore be done without grabbing the state machine lock. Updating the global
2024	* log will require grabbing the lock though.
2025	*
2026	* The entire log manager uses a logical block numbering scheme. Only
2027	* xlog_write_iclog knows about the fact that the log may not start with
2028	* block zero on a given device.
2029	*/
2030	STATIC void
2031	xlog_sync(
2032	struct xlog *log,
2033	struct xlog_in_core *iclog,
2034	struct xlog_ticket *ticket)
2035	{
2036	unsigned int count; / byte count of bwrite /
2037	unsigned int roundoff; / roundoff to BB or stripe /
2038	uint64_t bno;
2039	unsigned int size;
2040
2041	ASSERT(atomic_read(&iclog->ic_refcnt) == `0`);
2042	trace_xlog_iclog_sync(iclog, _RET_IP_);
2043
2044	count = xlog_calc_iclog_size(log, iclog, roundoff: &roundoff);
2045
2046	/*
2047	* If we have a ticket, account for the roundoff via the ticket
2048	* reservation to avoid touching the hot grant heads needlessly.
2049	* Otherwise, we have to move grant heads directly.
2050	*/
2051	if (ticket) {
2052	ticket->t_curr_res -= roundoff;
2053	} else {
2054	xlog_grant_add_space(log, head: &log->l_reserve_head.grant, bytes: roundoff);
2055	xlog_grant_add_space(log, head: &log->l_write_head.grant, bytes: roundoff);
2056	}
2057
2058	/ put cycle number in every block /
2059	xlog_pack_data(log, iclog, roundoff);
2060
2061	/ real byte length /
2062	size = iclog->ic_offset;
2063	if (xfs_has_logv2(mp: log->l_mp))
2064	size += roundoff;
2065	iclog->ic_header.h_len = cpu_to_be32(size);
2066
2067	XFS_STATS_INC(log->l_mp, xs_log_writes);
2068	XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
2069
2070	bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));
2071
2072	/ Do we need to split this write into 2 parts? /
2073	if (bno + BTOBB(count) > log->l_logBBsize)
2074	xlog_split_iclog(log, data: &iclog->ic_header, bno, count);
2075
2076	/ calculcate the checksum /
2077	iclog->ic_header.h_crc = xlog_cksum(log, rhead: &iclog->ic_header,
2078	dp: iclog->ic_datap, size);
2079	/*
2080	* Intentionally corrupt the log record CRC based on the error injection
2081	* frequency, if defined. This facilitates testing log recovery in the
2082	* event of torn writes. Hence, set the IOABORT state to abort the log
2083	* write on I/O completion and shutdown the fs. The subsequent mount
2084	* detects the bad CRC and attempts to recover.
2085	*/
2086	#ifdef DEBUG
2087	if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
2088	iclog->ic_header.h_crc &= cpu_to_le32(`0xAAAAAAAA`);
2089	iclog->ic_fail_crc = true;
2090	xfs_warn(log->l_mp,
2091	"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
2092	be64_to_cpu(iclog->ic_header.h_lsn));
2093	}
2094	#endif
2095	xlog_verify_iclog(log, iclog, count);
2096	xlog_write_iclog(log, iclog, bno, count);
2097	}
2098
2099	/*
2100	* Deallocate a log structure
2101	*/
2102	STATIC void
2103	xlog_dealloc_log(
2104	struct xlog *log)
2105	{
2106	xlog_in_core_t iclog, next_iclog;
2107	int i;
2108
2109	/*
2110	* Destroy the CIL after waiting for iclog IO completion because an
2111	* iclog EIO error will try to shut down the log, which accesses the
2112	* CIL to wake up the waiters.
2113	*/
2114	xlog_cil_destroy(log);
2115
2116	iclog = log->l_iclog;
2117	for (i = `0`; i < log->l_iclog_bufs; i++) {
2118	next_iclog = iclog->ic_next;
2119	kmem_free(ptr: iclog->ic_data);
2120	kmem_free(ptr: iclog);
2121	iclog = next_iclog;
2122	}
2123
2124	log->l_mp->m_log = NULL;
2125	destroy_workqueue(wq: log->l_ioend_workqueue);
2126	kmem_free(ptr: log);
2127	}
2128
2129	/*
2130	* Update counters atomically now that memcpy is done.
2131	*/
2132	static inline void
2133	xlog_state_finish_copy(
2134	struct xlog *log,
2135	struct xlog_in_core *iclog,
2136	int record_cnt,
2137	int copy_bytes)
2138	{
2139	lockdep_assert_held(&log->l_icloglock);
2140
2141	be32_add_cpu(var: &iclog->ic_header.h_num_logops, val: record_cnt);
2142	iclog->ic_offset += copy_bytes;
2143	}
2144
2145	/*
2146	* print out info relating to regions written which consume
2147	* the reservation
2148	*/
2149	void
2150	xlog_print_tic_res(
2151	struct xfs_mount *mp,
2152	struct xlog_ticket *ticket)
2153	{
2154	xfs_warn(mp, "ticket reservation summary:");
2155	xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res);
2156	xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res);
2157	xfs_warn(mp, " original count = %d", ticket->t_ocnt);
2158	xfs_warn(mp, " remaining count = %d", ticket->t_cnt);
2159	}
2160
2161	/*
2162	* Print a summary of the transaction.
2163	*/
2164	void
2165	xlog_print_trans(
2166	struct xfs_trans *tp)
2167	{
2168	struct xfs_mount *mp = tp->t_mountp;
2169	struct xfs_log_item *lip;
2170
2171	/ dump core transaction and ticket info /
2172	xfs_warn(mp, "transaction summary:");
2173	xfs_warn(mp, " log res = %d", tp->t_log_res);
2174	xfs_warn(mp, " log count = %d", tp->t_log_count);
2175	xfs_warn(mp, " flags = 0x%x", tp->t_flags);
2176
2177	xlog_print_tic_res(mp, ticket: tp->t_ticket);
2178
2179	/ dump each log item /
2180	list_for_each_entry(lip, &tp->t_items, li_trans) {
2181	struct xfs_log_vec *lv = lip->li_lv;
2182	struct xfs_log_iovec *vec;
2183	int i;
2184
2185	xfs_warn(mp, "log item: ");
2186	xfs_warn(mp, " type = 0x%x", lip->li_type);
2187	xfs_warn(mp, " flags = 0x%lx", lip->li_flags);
2188	if (!lv)
2189	continue;
2190	xfs_warn(mp, " niovecs = %d", lv->lv_niovecs);
2191	xfs_warn(mp, " size = %d", lv->lv_size);
2192	xfs_warn(mp, " bytes = %d", lv->lv_bytes);
2193	xfs_warn(mp, " buf len = %d", lv->lv_buf_len);
2194
2195	/ dump each iovec for the log item /
2196	vec = lv->lv_iovecp;
2197	for (i = `0`; i < lv->lv_niovecs; i++) {
2198	int dumplen = min(vec->i_len, `32`);
2199
2200	xfs_warn(mp, " iovec[%d]", i);
2201	xfs_warn(mp, " type = 0x%x", vec->i_type);
2202	xfs_warn(mp, " len = %d", vec->i_len);
2203	xfs_warn(mp, " first %d bytes of iovec[%d]:", dumplen, i);
2204	xfs_hex_dump(p: vec->i_addr, length: dumplen);
2205
2206	vec++;
2207	}
2208	}
2209	}
2210
2211	static inline void
2212	xlog_write_iovec(
2213	struct xlog_in_core *iclog,
2214	uint32_t *log_offset,
2215	void *data,
2216	uint32_t write_len,
2217	int *bytes_left,
2218	uint32_t *record_cnt,
2219	uint32_t *data_cnt)
2220	{
2221	ASSERT(*log_offset < iclog->ic_log->l_iclog_size);
2222	ASSERT(log_offset % sizeof*(int32_t) == `0`);
2223	ASSERT(write_len % sizeof(int32_t) == `0`);
2224
2225	memcpy(iclog->ic_datap + *log_offset, data, write_len);
2226	*log_offset += write_len;
2227	*bytes_left -= write_len;
2228	(*record_cnt)++;
2229	*data_cnt += write_len;
2230	}
2231
2232	/*
2233	* Write log vectors into a single iclog which is guaranteed by the caller
2234	* to have enough space to write the entire log vector into.
2235	*/
2236	static void
2237	xlog_write_full(
2238	struct xfs_log_vec *lv,
2239	struct xlog_ticket *ticket,
2240	struct xlog_in_core *iclog,
2241	uint32_t *log_offset,
2242	uint32_t *len,
2243	uint32_t *record_cnt,
2244	uint32_t *data_cnt)
2245	{
2246	int index;
2247
2248	ASSERT(log_offset + len <= iclog->ic_size \|\|
2249	iclog->ic_state == XLOG_STATE_WANT_SYNC);
2250
2251	/*
2252	* Ordered log vectors have no regions to write so this
2253	* loop will naturally skip them.
2254	*/
2255	for (index = `0`; index < lv->lv_niovecs; index++) {
2256	struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
2257	struct xlog_op_header *ophdr = reg->i_addr;
2258
2259	ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
2260	xlog_write_iovec(iclog, log_offset, data: reg->i_addr,
2261	write_len: reg->i_len, bytes_left: len, record_cnt, data_cnt);
2262	}
2263	}
2264
2265	static int
2266	xlog_write_get_more_iclog_space(
2267	struct xlog_ticket *ticket,
2268	struct xlog_in_core **iclogp,
2269	uint32_t *log_offset,
2270	uint32_t len,
2271	uint32_t *record_cnt,
2272	uint32_t *data_cnt)
2273	{
2274	struct xlog_in_core iclog = iclogp;
2275	struct xlog *log = iclog->ic_log;
2276	int error;
2277
2278	spin_lock(lock: &log->l_icloglock);
2279	ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC);
2280	xlog_state_finish_copy(log, iclog, record_cnt: record_cnt, copy_bytes: data_cnt);
2281	error = xlog_state_release_iclog(log, iclog, ticket);
2282	spin_unlock(lock: &log->l_icloglock);
2283	if (error)
2284	return error;
2285
2286	error = xlog_state_get_iclog_space(log, len, iclog: &iclog, ticket,
2287	logoffsetp: log_offset);
2288	if (error)
2289	return error;
2290	*record_cnt = `0`;
2291	*data_cnt = `0`;
2292	*iclogp = iclog;
2293	return `0`;
2294	}
2295
2296	/*
2297	* Write log vectors into a single iclog which is smaller than the current chain
2298	* length. We write until we cannot fit a full record into the remaining space
2299	* and then stop. We return the log vector that is to be written that cannot
2300	* wholly fit in the iclog.
2301	*/
2302	static int
2303	xlog_write_partial(
2304	struct xfs_log_vec *lv,
2305	struct xlog_ticket *ticket,
2306	struct xlog_in_core **iclogp,
2307	uint32_t *log_offset,
2308	uint32_t *len,
2309	uint32_t *record_cnt,
2310	uint32_t *data_cnt)
2311	{
2312	struct xlog_in_core iclog = iclogp;
2313	struct xlog_op_header *ophdr;
2314	int index = `0`;
2315	uint32_t rlen;
2316	int error;
2317
2318	/ walk the logvec, copying until we run out of space in the iclog /
2319	for (index = `0`; index < lv->lv_niovecs; index++) {
2320	struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
2321	uint32_t reg_offset = `0`;
2322
2323	/*
2324	* The first region of a continuation must have a non-zero
2325	* length otherwise log recovery will just skip over it and
2326	* start recovering from the next opheader it finds. Because we
2327	* mark the next opheader as a continuation, recovery will then
2328	* incorrectly add the continuation to the previous region and
2329	* that breaks stuff.
2330	*
2331	* Hence if there isn't space for region data after the
2332	* opheader, then we need to start afresh with a new iclog.
2333	*/
2334	if (iclog->ic_size - *log_offset <=
2335	sizeof(struct xlog_op_header)) {
2336	error = xlog_write_get_more_iclog_space(ticket,
2337	iclogp: &iclog, log_offset, len: *len, record_cnt,
2338	data_cnt);
2339	if (error)
2340	return error;
2341	}
2342
2343	ophdr = reg->i_addr;
2344	rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset);
2345
2346	ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
2347	ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header));
2348	if (rlen != reg->i_len)
2349	ophdr->oh_flags \|= XLOG_CONTINUE_TRANS;
2350
2351	xlog_write_iovec(iclog, log_offset, data: reg->i_addr,
2352	write_len: rlen, bytes_left: len, record_cnt, data_cnt);
2353
2354	/ If we wrote the whole region, move to the next. /
2355	if (rlen == reg->i_len)
2356	continue;
2357
2358	/*
2359	* We now have a partially written iovec, but it can span
2360	* multiple iclogs so we loop here. First we release the iclog
2361	* we currently have, then we get a new iclog and add a new
2362	* opheader. Then we continue copying from where we were until
2363	* we either complete the iovec or fill the iclog. If we
2364	* complete the iovec, then we increment the index and go right
2365	* back to the top of the outer loop. if we fill the iclog, we
2366	* run the inner loop again.
2367	*
2368	* This is complicated by the tail of a region using all the
2369	* space in an iclog and hence requiring us to release the iclog
2370	* and get a new one before returning to the outer loop. We must
2371	* always guarantee that we exit this inner loop with at least
2372	* space for log transaction opheaders left in the current
2373	* iclog, hence we cannot just terminate the loop at the end
2374	* of the of the continuation. So we loop while there is no
2375	* space left in the current iclog, and check for the end of the
2376	* continuation after getting a new iclog.
2377	*/
2378	do {
2379	/*
2380	* Ensure we include the continuation opheader in the
2381	* space we need in the new iclog by adding that size
2382	* to the length we require. This continuation opheader
2383	* needs to be accounted to the ticket as the space it
2384	* consumes hasn't been accounted to the lv we are
2385	* writing.
2386	*/
2387	error = xlog_write_get_more_iclog_space(ticket,
2388	&iclog, log_offset,
2389	len + sizeof(struct* xlog_op_header),
2390	record_cnt, data_cnt);
2391	if (error)
2392	return error;
2393
2394	ophdr = iclog->ic_datap + *log_offset;
2395	ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
2396	ophdr->oh_clientid = XFS_TRANSACTION;
2397	ophdr->oh_res2 = `0`;
2398	ophdr->oh_flags = XLOG_WAS_CONT_TRANS;
2399
2400	ticket->t_curr_res -= sizeof(struct xlog_op_header);
2401	log_offset += sizeof(struct* xlog_op_header);
2402	data_cnt += sizeof(struct* xlog_op_header);
2403
2404	/*
2405	* If rlen fits in the iclog, then end the region
2406	* continuation. Otherwise we're going around again.
2407	*/
2408	reg_offset += rlen;
2409	rlen = reg->i_len - reg_offset;
2410	if (rlen <= iclog->ic_size - *log_offset)
2411	ophdr->oh_flags \|= XLOG_END_TRANS;
2412	else
2413	ophdr->oh_flags \|= XLOG_CONTINUE_TRANS;
2414
2415	rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset);
2416	ophdr->oh_len = cpu_to_be32(rlen);
2417
2418	xlog_write_iovec(iclog, log_offset,
2419	data: reg->i_addr + reg_offset,
2420	write_len: rlen, bytes_left: len, record_cnt, data_cnt);
2421
2422	} while (ophdr->oh_flags & XLOG_CONTINUE_TRANS);
2423	}
2424
2425	/*
2426	* No more iovecs remain in this logvec so return the next log vec to
2427	* the caller so it can go back to fast path copying.
2428	*/
2429	*iclogp = iclog;
2430	return `0`;
2431	}
2432
2433	/*
2434	* Write some region out to in-core log
2435	*
2436	* This will be called when writing externally provided regions or when
2437	* writing out a commit record for a given transaction.
2438	*
2439	* General algorithm:
2440	* 1. Find total length of this write. This may include adding to the
2441	* lengths passed in.
2442	* 2. Check whether we violate the tickets reservation.
2443	* 3. While writing to this iclog
2444	* A. Reserve as much space in this iclog as can get
2445	* B. If this is first write, save away start lsn
2446	* C. While writing this region:
2447	* 1. If first write of transaction, write start record
2448	* 2. Write log operation header (header per region)
2449	* 3. Find out if we can fit entire region into this iclog
2450	* 4. Potentially, verify destination memcpy ptr
2451	* 5. Memcpy (partial) region
2452	* 6. If partial copy, release iclog; otherwise, continue
2453	* copying more regions into current iclog
2454	* 4. Mark want sync bit (in simulation mode)
2455	* 5. Release iclog for potential flush to on-disk log.
2456	*
2457	* ERRORS:
2458	* 1. Panic if reservation is overrun. This should never happen since
2459	* reservation amounts are generated internal to the filesystem.
2460	* NOTES:
2461	* 1. Tickets are single threaded data structures.
2462	* 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
2463	* syncing routine. When a single log_write region needs to span
2464	* multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
2465	* on all log operation writes which don't contain the end of the
2466	* region. The XLOG_END_TRANS bit is used for the in-core log
2467	* operation which contains the end of the continued log_write region.
2468	* 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
2469	* we don't really know exactly how much space will be used. As a result,
2470	* we don't update ic_offset until the end when we know exactly how many
2471	* bytes have been written out.
2472	*/
2473	int
2474	xlog_write(
2475	struct xlog *log,
2476	struct xfs_cil_ctx *ctx,
2477	struct list_head *lv_chain,
2478	struct xlog_ticket *ticket,
2479	uint32_t len)
2480
2481	{
2482	struct xlog_in_core *iclog = NULL;
2483	struct xfs_log_vec *lv;
2484	uint32_t record_cnt = `0`;
2485	uint32_t data_cnt = `0`;
2486	int error = `0`;
2487	int log_offset;
2488
2489	if (ticket->t_curr_res < `0`) {
2490	xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
2491	"ctx ticket reservation ran out. Need to up reservation");
2492	xlog_print_tic_res(mp: log->l_mp, ticket);
2493	xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
2494	}
2495
2496	error = xlog_state_get_iclog_space(log, len, iclog: &iclog, ticket,
2497	logoffsetp: &log_offset);
2498	if (error)
2499	return error;
2500
2501	ASSERT(log_offset <= iclog->ic_size - `1`);
2502
2503	/*
2504	* If we have a context pointer, pass it the first iclog we are
2505	* writing to so it can record state needed for iclog write
2506	* ordering.
2507	*/
2508	if (ctx)
2509	xlog_cil_set_ctx_write_state(ctx, iclog);
2510
2511	list_for_each_entry(lv, lv_chain, lv_list) {
2512	/*
2513	* If the entire log vec does not fit in the iclog, punt it to
2514	* the partial copy loop which can handle this case.
2515	*/
2516	if (lv->lv_niovecs &&
2517	lv->lv_bytes > iclog->ic_size - log_offset) {
2518	error = xlog_write_partial(lv, ticket, iclogp: &iclog,
2519	log_offset: &log_offset, len: &len, record_cnt: &record_cnt,
2520	data_cnt: &data_cnt);
2521	if (error) {
2522	/*
2523	* We have no iclog to release, so just return
2524	* the error immediately.
2525	*/
2526	return error;
2527	}
2528	} else {
2529	xlog_write_full(lv, ticket, iclog, log_offset: &log_offset,
2530	len: &len, record_cnt: &record_cnt, data_cnt: &data_cnt);
2531	}
2532	}
2533	ASSERT(len == `0`);
2534
2535	/*
2536	* We've already been guaranteed that the last writes will fit inside
2537	* the current iclog, and hence it will already have the space used by
2538	* those writes accounted to it. Hence we do not need to update the
2539	* iclog with the number of bytes written here.
2540	*/
2541	spin_lock(lock: &log->l_icloglock);
2542	xlog_state_finish_copy(log, iclog, record_cnt, copy_bytes: `0`);
2543	error = xlog_state_release_iclog(log, iclog, ticket);
2544	spin_unlock(lock: &log->l_icloglock);
2545
2546	return error;
2547	}
2548
2549	static void
2550	xlog_state_activate_iclog(
2551	struct xlog_in_core *iclog,
2552	int *iclogs_changed)
2553	{
2554	ASSERT(list_empty_careful(&iclog->ic_callbacks));
2555	trace_xlog_iclog_activate(iclog, _RET_IP_);
2556
2557	/*
2558	* If the number of ops in this iclog indicate it just contains the
2559	* dummy transaction, we can change state into IDLE (the second time
2560	* around). Otherwise we should change the state into NEED a dummy.
2561	* We don't need to cover the dummy.
2562	*/
2563	if (*iclogs_changed == `0` &&
2564	iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) {
2565	*iclogs_changed = `1`;
2566	} else {
2567	/*
2568	* We have two dirty iclogs so start over. This could also be
2569	* num of ops indicating this is not the dummy going out.
2570	*/
2571	*iclogs_changed = `2`;
2572	}
2573
2574	iclog->ic_state = XLOG_STATE_ACTIVE;
2575	iclog->ic_offset = `0`;
2576	iclog->ic_header.h_num_logops = `0`;
2577	memset(iclog->ic_header.h_cycle_data, `0`,
2578	sizeof(iclog->ic_header.h_cycle_data));
2579	iclog->ic_header.h_lsn = `0`;
2580	iclog->ic_header.h_tail_lsn = `0`;
2581	}
2582
2583	/*
2584	* Loop through all iclogs and mark all iclogs currently marked DIRTY as
2585	* ACTIVE after iclog I/O has completed.
2586	*/
2587	static void
2588	xlog_state_activate_iclogs(
2589	struct xlog *log,
2590	int *iclogs_changed)
2591	{
2592	struct xlog_in_core *iclog = log->l_iclog;
2593
2594	do {
2595	if (iclog->ic_state == XLOG_STATE_DIRTY)
2596	xlog_state_activate_iclog(iclog, iclogs_changed);
2597	/*
2598	* The ordering of marking iclogs ACTIVE must be maintained, so
2599	* an iclog doesn't become ACTIVE beyond one that is SYNCING.
2600	*/
2601	else if (iclog->ic_state != XLOG_STATE_ACTIVE)
2602	break;
2603	} while ((iclog = iclog->ic_next) != log->l_iclog);
2604	}
2605
2606	static int
2607	xlog_covered_state(
2608	int prev_state,
2609	int iclogs_changed)
2610	{
2611	/*
2612	* We go to NEED for any non-covering writes. We go to NEED2 if we just
2613	* wrote the first covering record (DONE). We go to IDLE if we just
2614	* wrote the second covering record (DONE2) and remain in IDLE until a
2615	* non-covering write occurs.
2616	*/
2617	switch (prev_state) {
2618	case XLOG_STATE_COVER_IDLE:
2619	if (iclogs_changed == `1`)
2620	return XLOG_STATE_COVER_IDLE;
2621	fallthrough;
2622	case XLOG_STATE_COVER_NEED:
2623	case XLOG_STATE_COVER_NEED2:
2624	break;
2625	case XLOG_STATE_COVER_DONE:
2626	if (iclogs_changed == `1`)
2627	return XLOG_STATE_COVER_NEED2;
2628	break;
2629	case XLOG_STATE_COVER_DONE2:
2630	if (iclogs_changed == `1`)
2631	return XLOG_STATE_COVER_IDLE;
2632	break;
2633	default:
2634	ASSERT(`0`);
2635	}
2636
2637	return XLOG_STATE_COVER_NEED;
2638	}
2639
2640	STATIC void
2641	xlog_state_clean_iclog(
2642	struct xlog *log,
2643	struct xlog_in_core *dirty_iclog)
2644	{
2645	int iclogs_changed = `0`;
2646
2647	trace_xlog_iclog_clean(iclog: dirty_iclog, _RET_IP_);
2648
2649	dirty_iclog->ic_state = XLOG_STATE_DIRTY;
2650
2651	xlog_state_activate_iclogs(log, iclogs_changed: &iclogs_changed);
2652	wake_up_all(&dirty_iclog->ic_force_wait);
2653
2654	if (iclogs_changed) {
2655	log->l_covered_state = xlog_covered_state(prev_state: log->l_covered_state,
2656	iclogs_changed);
2657	}
2658	}
2659
2660	STATIC xfs_lsn_t
2661	xlog_get_lowest_lsn(
2662	struct xlog *log)
2663	{
2664	struct xlog_in_core *iclog = log->l_iclog;
2665	xfs_lsn_t lowest_lsn = `0`, lsn;
2666
2667	do {
2668	if (iclog->ic_state == XLOG_STATE_ACTIVE \|\|
2669	iclog->ic_state == XLOG_STATE_DIRTY)
2670	continue;
2671
2672	lsn = be64_to_cpu(iclog->ic_header.h_lsn);
2673	if ((lsn && !lowest_lsn) \|\| XFS_LSN_CMP(lsn, lowest_lsn) < `0`)
2674	lowest_lsn = lsn;
2675	} while ((iclog = iclog->ic_next) != log->l_iclog);
2676
2677	return lowest_lsn;
2678	}
2679
2680	/*
2681	* Completion of a iclog IO does not imply that a transaction has completed, as
2682	* transactions can be large enough to span many iclogs. We cannot change the
2683	* tail of the log half way through a transaction as this may be the only
2684	* transaction in the log and moving the tail to point to the middle of it
2685	* will prevent recovery from finding the start of the transaction. Hence we
2686	* should only update the last_sync_lsn if this iclog contains transaction
2687	* completion callbacks on it.
2688	*
2689	* We have to do this before we drop the icloglock to ensure we are the only one
2690	* that can update it.
2691	*
2692	* If we are moving the last_sync_lsn forwards, we also need to ensure we kick
2693	* the reservation grant head pushing. This is due to the fact that the push
2694	* target is bound by the current last_sync_lsn value. Hence if we have a large
2695	* amount of log space bound up in this committing transaction then the
2696	* last_sync_lsn value may be the limiting factor preventing tail pushing from
2697	* freeing space in the log. Hence once we've updated the last_sync_lsn we
2698	* should push the AIL to ensure the push target (and hence the grant head) is
2699	* no longer bound by the old log head location and can move forwards and make
2700	* progress again.
2701	*/
2702	static void
2703	xlog_state_set_callback(
2704	struct xlog *log,
2705	struct xlog_in_core *iclog,
2706	xfs_lsn_t header_lsn)
2707	{
2708	trace_xlog_iclog_callback(iclog, _RET_IP_);
2709	iclog->ic_state = XLOG_STATE_CALLBACK;
2710
2711	ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2712	header_lsn) <= `0`);
2713
2714	if (list_empty_careful(head: &iclog->ic_callbacks))
2715	return;
2716
2717	atomic64_set(v: &log->l_last_sync_lsn, i: header_lsn);
2718	xlog_grant_push_ail(log, need_bytes: `0`);
2719	}
2720
2721	/*
2722	* Return true if we need to stop processing, false to continue to the next
2723	* iclog. The caller will need to run callbacks if the iclog is returned in the
2724	* XLOG_STATE_CALLBACK state.
2725	*/
2726	static bool
2727	xlog_state_iodone_process_iclog(
2728	struct xlog *log,
2729	struct xlog_in_core *iclog)
2730	{
2731	xfs_lsn_t lowest_lsn;
2732	xfs_lsn_t header_lsn;
2733
2734	switch (iclog->ic_state) {
2735	case XLOG_STATE_ACTIVE:
2736	case XLOG_STATE_DIRTY:
2737	/*
2738	* Skip all iclogs in the ACTIVE & DIRTY states:
2739	*/
2740	return false;
2741	case XLOG_STATE_DONE_SYNC:
2742	/*
2743	* Now that we have an iclog that is in the DONE_SYNC state, do
2744	* one more check here to see if we have chased our tail around.
2745	* If this is not the lowest lsn iclog, then we will leave it
2746	* for another completion to process.
2747	*/
2748	header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
2749	lowest_lsn = xlog_get_lowest_lsn(log);
2750	if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < `0`)
2751	return false;
2752	xlog_state_set_callback(log, iclog, header_lsn);
2753	return false;
2754	default:
2755	/*
2756	* Can only perform callbacks in order. Since this iclog is not
2757	* in the DONE_SYNC state, we skip the rest and just try to
2758	* clean up.
2759	*/
2760	return true;
2761	}
2762	}
2763
2764	/*
2765	* Loop over all the iclogs, running attached callbacks on them. Return true if
2766	* we ran any callbacks, indicating that we dropped the icloglock. We don't need
2767	* to handle transient shutdown state here at all because
2768	* xlog_state_shutdown_callbacks() will be run to do the necessary shutdown
2769	* cleanup of the callbacks.
2770	*/
2771	static bool
2772	xlog_state_do_iclog_callbacks(
2773	struct xlog *log)
2774	__releases(&log->l_icloglock)
2775	__acquires(&log->l_icloglock)
2776	{
2777	struct xlog_in_core *first_iclog = log->l_iclog;
2778	struct xlog_in_core *iclog = first_iclog;
2779	bool ran_callback = false;
2780
2781	do {
2782	LIST_HEAD(cb_list);
2783
2784	if (xlog_state_iodone_process_iclog(log, iclog))
2785	break;
2786	if (iclog->ic_state != XLOG_STATE_CALLBACK) {
2787	iclog = iclog->ic_next;
2788	continue;
2789	}
2790	list_splice_init(list: &iclog->ic_callbacks, head: &cb_list);
2791	spin_unlock(lock: &log->l_icloglock);
2792
2793	trace_xlog_iclog_callbacks_start(iclog, _RET_IP_);
2794	xlog_cil_process_committed(list: &cb_list);
2795	trace_xlog_iclog_callbacks_done(iclog, _RET_IP_);
2796	ran_callback = true;
2797
2798	spin_lock(lock: &log->l_icloglock);
2799	xlog_state_clean_iclog(log, dirty_iclog: iclog);
2800	iclog = iclog->ic_next;
2801	} while (iclog != first_iclog);
2802
2803	return ran_callback;
2804	}
2805
2806
2807	/*
2808	* Loop running iclog completion callbacks until there are no more iclogs in a
2809	* state that can run callbacks.
2810	*/
2811	STATIC void
2812	xlog_state_do_callback(
2813	struct xlog *log)
2814	{
2815	int flushcnt = `0`;
2816	int repeats = `0`;
2817
2818	spin_lock(lock: &log->l_icloglock);
2819	while (xlog_state_do_iclog_callbacks(log)) {
2820	if (xlog_is_shutdown(log))
2821	break;
2822
2823	if (++repeats > `5000`) {
2824	flushcnt += repeats;
2825	repeats = `0`;
2826	xfs_warn(log->l_mp,
2827	"%s: possible infinite loop (%d iterations)",
2828	__func__, flushcnt);
2829	}
2830	}
2831
2832	if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE)
2833	wake_up_all(&log->l_flush_wait);
2834
2835	spin_unlock(lock: &log->l_icloglock);
2836	}
2837
2838
2839	/*
2840	* Finish transitioning this iclog to the dirty state.
2841	*
2842	* Callbacks could take time, so they are done outside the scope of the
2843	* global state machine log lock.
2844	*/
2845	STATIC void
2846	xlog_state_done_syncing(
2847	struct xlog_in_core *iclog)
2848	{
2849	struct xlog *log = iclog->ic_log;
2850
2851	spin_lock(lock: &log->l_icloglock);
2852	ASSERT(atomic_read(&iclog->ic_refcnt) == `0`);
2853	trace_xlog_iclog_sync_done(iclog, _RET_IP_);
2854
2855	/*
2856	* If we got an error, either on the first buffer, or in the case of
2857	* split log writes, on the second, we shut down the file system and
2858	* no iclogs should ever be attempted to be written to disk again.
2859	*/
2860	if (!xlog_is_shutdown(log)) {
2861	ASSERT(iclog->ic_state == XLOG_STATE_SYNCING);
2862	iclog->ic_state = XLOG_STATE_DONE_SYNC;
2863	}
2864
2865	/*
2866	* Someone could be sleeping prior to writing out the next
2867	* iclog buffer, we wake them all, one will get to do the
2868	* I/O, the others get to wait for the result.
2869	*/
2870	wake_up_all(&iclog->ic_write_wait);
2871	spin_unlock(lock: &log->l_icloglock);
2872	xlog_state_do_callback(log);
2873	}
2874
2875	/*
2876	* If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
2877	* sleep. We wait on the flush queue on the head iclog as that should be
2878	* the first iclog to complete flushing. Hence if all iclogs are syncing,
2879	* we will wait here and all new writes will sleep until a sync completes.
2880	*
2881	* The in-core logs are used in a circular fashion. They are not used
2882	* out-of-order even when an iclog past the head is free.
2883	*
2884	* return:
2885	* * log_offset where xlog_write() can start writing into the in-core
2886	* log's data space.
2887	* * in-core log pointer to which xlog_write() should write.
2888	* * boolean indicating this is a continued write to an in-core log.
2889	* If this is the last write, then the in-core log's offset field
2890	* needs to be incremented, depending on the amount of data which
2891	* is copied.
2892	*/
2893	STATIC int
2894	xlog_state_get_iclog_space(
2895	struct xlog *log,
2896	int len,
2897	struct xlog_in_core **iclogp,
2898	struct xlog_ticket *ticket,
2899	int *logoffsetp)
2900	{
2901	int log_offset;
2902	xlog_rec_header_t *head;
2903	xlog_in_core_t *iclog;
2904
2905	restart:
2906	spin_lock(lock: &log->l_icloglock);
2907	if (xlog_is_shutdown(log)) {
2908	spin_unlock(lock: &log->l_icloglock);
2909	return -EIO;
2910	}
2911
2912	iclog = log->l_iclog;
2913	if (iclog->ic_state != XLOG_STATE_ACTIVE) {
2914	XFS_STATS_INC(log->l_mp, xs_log_noiclogs);
2915
2916	/ Wait for log writes to have flushed /
2917	xlog_wait(wq: &log->l_flush_wait, lock: &log->l_icloglock);
2918	goto restart;
2919	}
2920
2921	head = &iclog->ic_header;
2922
2923	atomic_inc(v: &iclog->ic_refcnt); / prevents sync /
2924	log_offset = iclog->ic_offset;
2925
2926	trace_xlog_iclog_get_space(iclog, _RET_IP_);
2927
2928	/ On the 1st write to an iclog, figure out lsn. This works*
2929	* if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
2930	* committing to. If the offset is set, that's how many blocks
2931	* must be written.
2932	*/
2933	if (log_offset == `0`) {
2934	ticket->t_curr_res -= log->l_iclog_hsize;
2935	head->h_cycle = cpu_to_be32(log->l_curr_cycle);
2936	head->h_lsn = cpu_to_be64(
2937	xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
2938	ASSERT(log->l_curr_block >= `0`);
2939	}
2940
2941	/ If there is enough room to write everything, then do it. Otherwise,*
2942	* claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC
2943	* bit is on, so this will get flushed out. Don't update ic_offset
2944	* until you know exactly how many bytes get copied. Therefore, wait
2945	* until later to update ic_offset.
2946	*
2947	* xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
2948	* can fit into remaining data section.
2949	*/
2950	if (iclog->ic_size - iclog->ic_offset < `2`*sizeof(xlog_op_header_t)) {
2951	int error = `0`;
2952
2953	xlog_state_switch_iclogs(log, iclog, eventual_size: iclog->ic_size);
2954
2955	/*
2956	* If we are the only one writing to this iclog, sync it to
2957	* disk. We need to do an atomic compare and decrement here to
2958	* avoid racing with concurrent atomic_dec_and_lock() calls in
2959	* xlog_state_release_iclog() when there is more than one
2960	* reference to the iclog.
2961	*/
2962	if (!atomic_add_unless(v: &iclog->ic_refcnt, a: -`1`, u: `1`))
2963	error = xlog_state_release_iclog(log, iclog, ticket);
2964	spin_unlock(lock: &log->l_icloglock);
2965	if (error)
2966	return error;
2967	goto restart;
2968	}
2969
2970	/ Do we have enough room to write the full amount in the remainder*
2971	* of this iclog? Or must we continue a write on the next iclog and
2972	* mark this iclog as completely taken? In the case where we switch
2973	* iclogs (to mark it taken), this particular iclog will release/sync
2974	* to disk in xlog_write().
2975	*/
2976	if (len <= iclog->ic_size - iclog->ic_offset)
2977	iclog->ic_offset += len;
2978	else
2979	xlog_state_switch_iclogs(log, iclog, eventual_size: iclog->ic_size);
2980	*iclogp = iclog;
2981
2982	ASSERT(iclog->ic_offset <= iclog->ic_size);
2983	spin_unlock(lock: &log->l_icloglock);
2984
2985	*logoffsetp = log_offset;
2986	return `0`;
2987	}
2988
2989	/*
2990	* The first cnt-1 times a ticket goes through here we don't need to move the
2991	* grant write head because the permanent reservation has reserved cnt times the
2992	* unit amount. Release part of current permanent unit reservation and reset
2993	* current reservation to be one units worth. Also move grant reservation head
2994	* forward.
2995	*/
2996	void
2997	xfs_log_ticket_regrant(
2998	struct xlog *log,
2999	struct xlog_ticket *ticket)
3000	{
3001	trace_xfs_log_ticket_regrant(log, tic: ticket);
3002
3003	if (ticket->t_cnt > `0`)
3004	ticket->t_cnt--;
3005
3006	xlog_grant_sub_space(log, head: &log->l_reserve_head.grant,
3007	bytes: ticket->t_curr_res);
3008	xlog_grant_sub_space(log, head: &log->l_write_head.grant,
3009	bytes: ticket->t_curr_res);
3010	ticket->t_curr_res = ticket->t_unit_res;
3011
3012	trace_xfs_log_ticket_regrant_sub(log, tic: ticket);
3013
3014	/ just return if we still have some of the pre-reserved space /
3015	if (!ticket->t_cnt) {
3016	xlog_grant_add_space(log, head: &log->l_reserve_head.grant,
3017	bytes: ticket->t_unit_res);
3018	trace_xfs_log_ticket_regrant_exit(log, tic: ticket);
3019
3020	ticket->t_curr_res = ticket->t_unit_res;
3021	}
3022
3023	xfs_log_ticket_put(ticket);
3024	}
3025
3026	/*
3027	* Give back the space left from a reservation.
3028	*
3029	* All the information we need to make a correct determination of space left
3030	* is present. For non-permanent reservations, things are quite easy. The
3031	* count should have been decremented to zero. We only need to deal with the
3032	* space remaining in the current reservation part of the ticket. If the
3033	* ticket contains a permanent reservation, there may be left over space which
3034	* needs to be released. A count of N means that N-1 refills of the current
3035	* reservation can be done before we need to ask for more space. The first
3036	* one goes to fill up the first current reservation. Once we run out of
3037	* space, the count will stay at zero and the only space remaining will be
3038	* in the current reservation field.
3039	*/
3040	void
3041	xfs_log_ticket_ungrant(
3042	struct xlog *log,
3043	struct xlog_ticket *ticket)
3044	{
3045	int bytes;
3046
3047	trace_xfs_log_ticket_ungrant(log, tic: ticket);
3048
3049	if (ticket->t_cnt > `0`)
3050	ticket->t_cnt--;
3051
3052	trace_xfs_log_ticket_ungrant_sub(log, tic: ticket);
3053
3054	/*
3055	* If this is a permanent reservation ticket, we may be able to free
3056	* up more space based on the remaining count.
3057	*/
3058	bytes = ticket->t_curr_res;
3059	if (ticket->t_cnt > `0`) {
3060	ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
3061	bytes += ticket->t_unit_res*ticket->t_cnt;
3062	}
3063
3064	xlog_grant_sub_space(log, head: &log->l_reserve_head.grant, bytes);
3065	xlog_grant_sub_space(log, head: &log->l_write_head.grant, bytes);
3066
3067	trace_xfs_log_ticket_ungrant_exit(log, tic: ticket);
3068
3069	xfs_log_space_wake(mp: log->l_mp);
3070	xfs_log_ticket_put(ticket);
3071	}
3072
3073	/*
3074	* This routine will mark the current iclog in the ring as WANT_SYNC and move
3075	* the current iclog pointer to the next iclog in the ring.
3076	*/
3077	void
3078	xlog_state_switch_iclogs(
3079	struct xlog *log,
3080	struct xlog_in_core *iclog,
3081	int eventual_size)
3082	{
3083	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
3084	assert_spin_locked(&log->l_icloglock);
3085	trace_xlog_iclog_switch(iclog, _RET_IP_);
3086
3087	if (!eventual_size)
3088	eventual_size = iclog->ic_offset;
3089	iclog->ic_state = XLOG_STATE_WANT_SYNC;
3090	iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block);
3091	log->l_prev_block = log->l_curr_block;
3092	log->l_prev_cycle = log->l_curr_cycle;
3093
3094	/ roll log?: ic_offset changed later /
3095	log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
3096
3097	/ Round up to next log-sunit /
3098	if (log->l_iclog_roundoff > BBSIZE) {
3099	uint32_t sunit_bb = BTOBB(log->l_iclog_roundoff);
3100	log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
3101	}
3102
3103	if (log->l_curr_block >= log->l_logBBsize) {
3104	/*
3105	* Rewind the current block before the cycle is bumped to make
3106	* sure that the combined LSN never transiently moves forward
3107	* when the log wraps to the next cycle. This is to support the
3108	* unlocked sample of these fields from xlog_valid_lsn(). Most
3109	* other cases should acquire l_icloglock.
3110	*/
3111	log->l_curr_block -= log->l_logBBsize;
3112	ASSERT(log->l_curr_block >= `0`);
3113	smp_wmb();
3114	log->l_curr_cycle++;
3115	if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
3116	log->l_curr_cycle++;
3117	}
3118	ASSERT(iclog == log->l_iclog);
3119	log->l_iclog = iclog->ic_next;
3120	}
3121
3122	/*
3123	* Force the iclog to disk and check if the iclog has been completed before
3124	* xlog_force_iclog() returns. This can happen on synchronous (e.g.
3125	* pmem) or fast async storage because we drop the icloglock to issue the IO.
3126	* If completion has already occurred, tell the caller so that it can avoid an
3127	* unnecessary wait on the iclog.
3128	*/
3129	static int
3130	xlog_force_and_check_iclog(
3131	struct xlog_in_core *iclog,
3132	bool *completed)
3133	{
3134	xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn);
3135	int error;
3136
3137	*completed = false;
3138	error = xlog_force_iclog(iclog);
3139	if (error)
3140	return error;
3141
3142	/*
3143	* If the iclog has already been completed and reused the header LSN
3144	* will have been rewritten by completion
3145	*/
3146	if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
3147	*completed = true;
3148	return `0`;
3149	}
3150
3151	/*
3152	* Write out all data in the in-core log as of this exact moment in time.
3153	*
3154	* Data may be written to the in-core log during this call. However,
3155	* we don't guarantee this data will be written out. A change from past
3156	* implementation means this routine will not write out zero length LRs.
3157	*
3158	* Basically, we try and perform an intelligent scan of the in-core logs.
3159	* If we determine there is no flushable data, we just return. There is no
3160	* flushable data if:
3161	*
3162	* 1. the current iclog is active and has no data; the previous iclog
3163	* is in the active or dirty state.
3164	* 2. the current iclog is drity, and the previous iclog is in the
3165	* active or dirty state.
3166	*
3167	* We may sleep if:
3168	*
3169	* 1. the current iclog is not in the active nor dirty state.
3170	* 2. the current iclog dirty, and the previous iclog is not in the
3171	* active nor dirty state.
3172	* 3. the current iclog is active, and there is another thread writing
3173	* to this particular iclog.
3174	* 4. a) the current iclog is active and has no other writers
3175	* b) when we return from flushing out this iclog, it is still
3176	* not in the active nor dirty state.
3177	*/
3178	int
3179	xfs_log_force(
3180	struct xfs_mount *mp,
3181	uint flags)
3182	{
3183	struct xlog *log = mp->m_log;
3184	struct xlog_in_core *iclog;
3185
3186	XFS_STATS_INC(mp, xs_log_force);
3187	trace_xfs_log_force(mp, `0`, _RET_IP_);
3188
3189	xlog_cil_force(log);
3190
3191	spin_lock(lock: &log->l_icloglock);
3192	if (xlog_is_shutdown(log))
3193	goto out_error;
3194
3195	iclog = log->l_iclog;
3196	trace_xlog_iclog_force(iclog, _RET_IP_);
3197
3198	if (iclog->ic_state == XLOG_STATE_DIRTY \|\|
3199	(iclog->ic_state == XLOG_STATE_ACTIVE &&
3200	atomic_read(v: &iclog->ic_refcnt) == `0` && iclog->ic_offset == `0`)) {
3201	/*
3202	* If the head is dirty or (active and empty), then we need to
3203	* look at the previous iclog.
3204	*
3205	* If the previous iclog is active or dirty we are done. There
3206	* is nothing to sync out. Otherwise, we attach ourselves to the
3207	* previous iclog and go to sleep.
3208	*/
3209	iclog = iclog->ic_prev;
3210	} else if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3211	if (atomic_read(v: &iclog->ic_refcnt) == `0`) {
3212	/ We have exclusive access to this iclog. /
3213	bool completed;
3214
3215	if (xlog_force_and_check_iclog(iclog, completed: &completed))
3216	goto out_error;
3217
3218	if (completed)
3219	goto out_unlock;
3220	} else {
3221	/*
3222	* Someone else is still writing to this iclog, so we
3223	* need to ensure that when they release the iclog it
3224	* gets synced immediately as we may be waiting on it.
3225	*/
3226	xlog_state_switch_iclogs(log, iclog, eventual_size: `0`);
3227	}
3228	}
3229
3230	/*
3231	* The iclog we are about to wait on may contain the checkpoint pushed
3232	* by the above xlog_cil_force() call, but it may not have been pushed
3233	* to disk yet. Like the ACTIVE case above, we need to make sure caches
3234	* are flushed when this iclog is written.
3235	*/
3236	if (iclog->ic_state == XLOG_STATE_WANT_SYNC)
3237	iclog->ic_flags \|= XLOG_ICL_NEED_FLUSH \| XLOG_ICL_NEED_FUA;
3238
3239	if (flags & XFS_LOG_SYNC)
3240	return xlog_wait_on_iclog(iclog);
3241	out_unlock:
3242	spin_unlock(lock: &log->l_icloglock);
3243	return `0`;
3244	out_error:
3245	spin_unlock(lock: &log->l_icloglock);
3246	return -EIO;
3247	}
3248
3249	/*
3250	* Force the log to a specific LSN.
3251	*
3252	* If an iclog with that lsn can be found:
3253	* If it is in the DIRTY state, just return.
3254	* If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
3255	* state and go to sleep or return.
3256	* If it is in any other state, go to sleep or return.
3257	*
3258	* Synchronous forces are implemented with a wait queue. All callers trying
3259	* to force a given lsn to disk must wait on the queue attached to the
3260	* specific in-core log. When given in-core log finally completes its write
3261	* to disk, that thread will wake up all threads waiting on the queue.
3262	*/
3263	static int
3264	xlog_force_lsn(
3265	struct xlog *log,
3266	xfs_lsn_t lsn,
3267	uint flags,
3268	int *log_flushed,
3269	bool already_slept)
3270	{
3271	struct xlog_in_core *iclog;
3272	bool completed;
3273
3274	spin_lock(lock: &log->l_icloglock);
3275	if (xlog_is_shutdown(log))
3276	goto out_error;
3277
3278	iclog = log->l_iclog;
3279	while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
3280	trace_xlog_iclog_force_lsn(iclog, _RET_IP_);
3281	iclog = iclog->ic_next;
3282	if (iclog == log->l_iclog)
3283	goto out_unlock;
3284	}
3285
3286	switch (iclog->ic_state) {
3287	case XLOG_STATE_ACTIVE:
3288	/*
3289	* We sleep here if we haven't already slept (e.g. this is the
3290	* first time we've looked at the correct iclog buf) and the
3291	* buffer before us is going to be sync'ed. The reason for this
3292	* is that if we are doing sync transactions here, by waiting
3293	* for the previous I/O to complete, we can allow a few more
3294	* transactions into this iclog before we close it down.
3295	*
3296	* Otherwise, we mark the buffer WANT_SYNC, and bump up the
3297	* refcnt so we can release the log (which drops the ref count).
3298	* The state switch keeps new transaction commits from using
3299	* this buffer. When the current commits finish writing into
3300	* the buffer, the refcount will drop to zero and the buffer
3301	* will go out then.
3302	*/
3303	if (!already_slept &&
3304	(iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC \|\|
3305	iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) {
3306	xlog_wait(wq: &iclog->ic_prev->ic_write_wait,
3307	lock: &log->l_icloglock);
3308	return -EAGAIN;
3309	}
3310	if (xlog_force_and_check_iclog(iclog, completed: &completed))
3311	goto out_error;
3312	if (log_flushed)
3313	*log_flushed = `1`;
3314	if (completed)
3315	goto out_unlock;
3316	break;
3317	case XLOG_STATE_WANT_SYNC:
3318	/*
3319	* This iclog may contain the checkpoint pushed by the
3320	* xlog_cil_force_seq() call, but there are other writers still
3321	* accessing it so it hasn't been pushed to disk yet. Like the
3322	* ACTIVE case above, we need to make sure caches are flushed
3323	* when this iclog is written.
3324	*/
3325	iclog->ic_flags \|= XLOG_ICL_NEED_FLUSH \| XLOG_ICL_NEED_FUA;
3326	break;
3327	default:
3328	/*
3329	* The entire checkpoint was written by the CIL force and is on
3330	* its way to disk already. It will be stable when it
3331	* completes, so we don't need to manipulate caches here at all.
3332	* We just need to wait for completion if necessary.
3333	*/
3334	break;
3335	}
3336
3337	if (flags & XFS_LOG_SYNC)
3338	return xlog_wait_on_iclog(iclog);
3339	out_unlock:
3340	spin_unlock(lock: &log->l_icloglock);
3341	return `0`;
3342	out_error:
3343	spin_unlock(lock: &log->l_icloglock);
3344	return -EIO;
3345	}
3346
3347	/*
3348	* Force the log to a specific checkpoint sequence.
3349	*
3350	* First force the CIL so that all the required changes have been flushed to the
3351	* iclogs. If the CIL force completed it will return a commit LSN that indicates
3352	* the iclog that needs to be flushed to stable storage. If the caller needs
3353	* a synchronous log force, we will wait on the iclog with the LSN returned by
3354	* xlog_cil_force_seq() to be completed.
3355	*/
3356	int
3357	xfs_log_force_seq(
3358	struct xfs_mount *mp,
3359	xfs_csn_t seq,
3360	uint flags,
3361	int *log_flushed)
3362	{
3363	struct xlog *log = mp->m_log;
3364	xfs_lsn_t lsn;
3365	int ret;
3366	ASSERT(seq != `0`);
3367
3368	XFS_STATS_INC(mp, xs_log_force);
3369	trace_xfs_log_force(mp, seq, _RET_IP_);
3370
3371	lsn = xlog_cil_force_seq(log, seq);
3372	if (lsn == NULLCOMMITLSN)
3373	return `0`;
3374
3375	ret = xlog_force_lsn(log, lsn, flags, log_flushed, false);
3376	if (ret == -EAGAIN) {
3377	XFS_STATS_INC(mp, xs_log_force_sleep);
3378	ret = xlog_force_lsn(log, lsn, flags, log_flushed, true);
3379	}
3380	return ret;
3381	}
3382
3383	/*
3384	* Free a used ticket when its refcount falls to zero.
3385	*/
3386	void
3387	xfs_log_ticket_put(
3388	xlog_ticket_t *ticket)
3389	{
3390	ASSERT(atomic_read(&ticket->t_ref) > `0`);
3391	if (atomic_dec_and_test(v: &ticket->t_ref))
3392	kmem_cache_free(s: xfs_log_ticket_cache, objp: ticket);
3393	}
3394
3395	xlog_ticket_t *
3396	xfs_log_ticket_get(
3397	xlog_ticket_t *ticket)
3398	{
3399	ASSERT(atomic_read(&ticket->t_ref) > `0`);
3400	atomic_inc(v: &ticket->t_ref);
3401	return ticket;
3402	}
3403
3404	/*
3405	* Figure out the total log space unit (in bytes) that would be
3406	* required for a log ticket.
3407	*/
3408	static int
3409	xlog_calc_unit_res(
3410	struct xlog *log,
3411	int unit_bytes,
3412	int *niclogs)
3413	{
3414	int iclog_space;
3415	uint num_headers;
3416
3417	/*
3418	* Permanent reservations have up to 'cnt'-1 active log operations
3419	* in the log. A unit in this case is the amount of space for one
3420	* of these log operations. Normal reservations have a cnt of 1
3421	* and their unit amount is the total amount of space required.
3422	*
3423	* The following lines of code account for non-transaction data
3424	* which occupy space in the on-disk log.
3425	*
3426	* Normal form of a transaction is:
3427	* <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
3428	* and then there are LR hdrs, split-recs and roundoff at end of syncs.
3429	*
3430	* We need to account for all the leadup data and trailer data
3431	* around the transaction data.
3432	* And then we need to account for the worst case in terms of using
3433	* more space.
3434	* The worst case will happen if:
3435	* - the placement of the transaction happens to be such that the
3436	* roundoff is at its maximum
3437	* - the transaction data is synced before the commit record is synced
3438	* i.e. <transaction-data><roundoff> \| <commit-rec><roundoff>
3439	* Therefore the commit record is in its own Log Record.
3440	* This can happen as the commit record is called with its
3441	* own region to xlog_write().
3442	* This then means that in the worst case, roundoff can happen for
3443	* the commit-rec as well.
3444	* The commit-rec is smaller than padding in this scenario and so it is
3445	* not added separately.
3446	*/
3447
3448	/ for trans header /
3449	unit_bytes += sizeof(xlog_op_header_t);
3450	unit_bytes += sizeof(xfs_trans_header_t);
3451
3452	/ for start-rec /
3453	unit_bytes += sizeof(xlog_op_header_t);
3454
3455	/*
3456	* for LR headers - the space for data in an iclog is the size minus
3457	* the space used for the headers. If we use the iclog size, then we
3458	* undercalculate the number of headers required.
3459	*
3460	* Furthermore - the addition of op headers for split-recs might
3461	* increase the space required enough to require more log and op
3462	* headers, so take that into account too.
3463	*
3464	* IMPORTANT: This reservation makes the assumption that if this
3465	* transaction is the first in an iclog and hence has the LR headers
3466	* accounted to it, then the remaining space in the iclog is
3467	* exclusively for this transaction. i.e. if the transaction is larger
3468	* than the iclog, it will be the only thing in that iclog.
3469	* Fundamentally, this means we must pass the entire log vector to
3470	* xlog_write to guarantee this.
3471	*/
3472	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
3473	num_headers = howmany(unit_bytes, iclog_space);
3474
3475	/ for split-recs - ophdrs added when data split over LRs /
3476	unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3477
3478	/ add extra header reservations if we overrun /
3479	while (!num_headers \|\|
3480	howmany(unit_bytes, iclog_space) > num_headers) {
3481	unit_bytes += sizeof(xlog_op_header_t);
3482	num_headers++;
3483	}
3484	unit_bytes += log->l_iclog_hsize * num_headers;
3485
3486	/ for commit-rec LR header - note: padding will subsume the ophdr /
3487	unit_bytes += log->l_iclog_hsize;
3488
3489	/ roundoff padding for transaction data and one for commit record /
3490	unit_bytes += `2` * log->l_iclog_roundoff;
3491
3492	if (niclogs)
3493	*niclogs = num_headers;
3494	return unit_bytes;
3495	}
3496
3497	int
3498	xfs_log_calc_unit_res(
3499	struct xfs_mount *mp,
3500	int unit_bytes)
3501	{
3502	return xlog_calc_unit_res(log: mp->m_log, unit_bytes, NULL);
3503	}
3504
3505	/*
3506	* Allocate and initialise a new log ticket.
3507	*/
3508	struct xlog_ticket *
3509	xlog_ticket_alloc(
3510	struct xlog *log,
3511	int unit_bytes,
3512	int cnt,
3513	bool permanent)
3514	{
3515	struct xlog_ticket *tic;
3516	int unit_res;
3517
3518	tic = kmem_cache_zalloc(k: xfs_log_ticket_cache, GFP_NOFS \| __GFP_NOFAIL);
3519
3520	unit_res = xlog_calc_unit_res(log, unit_bytes, niclogs: &tic->t_iclog_hdrs);
3521
3522	atomic_set(v: &tic->t_ref, i: `1`);
3523	tic->t_task = current;
3524	INIT_LIST_HEAD(list: &tic->t_queue);
3525	tic->t_unit_res = unit_res;
3526	tic->t_curr_res = unit_res;
3527	tic->t_cnt = cnt;
3528	tic->t_ocnt = cnt;
3529	tic->t_tid = get_random_u32();
3530	if (permanent)
3531	tic->t_flags \|= XLOG_TIC_PERM_RESERV;
3532
3533	return tic;
3534	}
3535
3536	#if defined(DEBUG)
3537	/*
3538	* Check to make sure the grant write head didn't just over lap the tail. If
3539	* the cycles are the same, we can't be overlapping. Otherwise, make sure that
3540	* the cycles differ by exactly one and check the byte count.
3541	*
3542	* This check is run unlocked, so can give false positives. Rather than assert
3543	* on failures, use a warn-once flag and a panic tag to allow the admin to
3544	* determine if they want to panic the machine when such an error occurs. For
3545	* debug kernels this will have the same effect as using an assert but, unlinke
3546	* an assert, it can be turned off at runtime.
3547	*/
3548	STATIC void
3549	xlog_verify_grant_tail(
3550	struct xlog *log)
3551	{
3552	int tail_cycle, tail_blocks;
3553	int cycle, space;
3554
3555	xlog_crack_grant_head(head: &log->l_write_head.grant, cycle: &cycle, space: &space);
3556	xlog_crack_atomic_lsn(lsn: &log->l_tail_lsn, cycle: &tail_cycle, block: &tail_blocks);
3557	if (tail_cycle != cycle) {
3558	if (cycle - `1` != tail_cycle &&
3559	!test_and_set_bit(XLOG_TAIL_WARN, addr: &log->l_opstate)) {
3560	xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3561	"%s: cycle - 1 != tail_cycle", __func__);
3562	}
3563
3564	if (space > BBTOB(tail_blocks) &&
3565	!test_and_set_bit(XLOG_TAIL_WARN, addr: &log->l_opstate)) {
3566	xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3567	"%s: space > BBTOB(tail_blocks)", __func__);
3568	}
3569	}
3570	}
3571
3572	/ check if it will fit /
3573	STATIC void
3574	xlog_verify_tail_lsn(
3575	struct xlog *log,
3576	struct xlog_in_core *iclog)
3577	{
3578	xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn);
3579	int blocks;
3580
3581	if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
3582	blocks =
3583	log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
3584	if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
3585	xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3586	} else {
3587	ASSERT(CYCLE_LSN(tail_lsn)+`1` == log->l_prev_cycle);
3588
3589	if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
3590	xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
3591
3592	blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
3593	if (blocks < BTOBB(iclog->ic_offset) + `1`)
3594	xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3595	}
3596	}
3597
3598	/*
3599	* Perform a number of checks on the iclog before writing to disk.
3600	*
3601	* 1. Make sure the iclogs are still circular
3602	* 2. Make sure we have a good magic number
3603	* 3. Make sure we don't have magic numbers in the data
3604	* 4. Check fields of each log operation header for:
3605	* A. Valid client identifier
3606	* B. tid ptr value falls in valid ptr space (user space code)
3607	* C. Length in log record header is correct according to the
3608	* individual operation headers within record.
3609	* 5. When a bwrite will occur within 5 blocks of the front of the physical
3610	* log, check the preceding blocks of the physical log to make sure all
3611	* the cycle numbers agree with the current cycle number.
3612	*/
3613	STATIC void
3614	xlog_verify_iclog(
3615	struct xlog *log,
3616	struct xlog_in_core *iclog,
3617	int count)
3618	{
3619	xlog_op_header_t *ophead;
3620	xlog_in_core_t *icptr;
3621	xlog_in_core_2_t *xhdr;
3622	void base_ptr, ptr, *p;
3623	ptrdiff_t field_offset;
3624	uint8_t clientid;
3625	int len, i, j, k, op_len;
3626	int idx;
3627
3628	/ check validity of iclog pointers /
3629	spin_lock(lock: &log->l_icloglock);
3630	icptr = log->l_iclog;
3631	for (i = `0`; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next)
3632	ASSERT(icptr);
3633
3634	if (icptr != log->l_iclog)
3635	xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
3636	spin_unlock(lock: &log->l_icloglock);
3637
3638	/ check log magic numbers /
3639	if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
3640	xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
3641
3642	base_ptr = ptr = &iclog->ic_header;
3643	p = &iclog->ic_header;
3644	for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) {
3645	if ((__be32 )ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
3646	xfs_emerg(log->l_mp, "%s: unexpected magic num",
3647	__func__);
3648	}
3649
3650	/ check fields /
3651	len = be32_to_cpu(iclog->ic_header.h_num_logops);
3652	base_ptr = ptr = iclog->ic_datap;
3653	ophead = ptr;
3654	xhdr = iclog->ic_data;
3655	for (i = `0`; i < len; i++) {
3656	ophead = ptr;
3657
3658	/ clientid is only 1 byte /
3659	p = &ophead->oh_clientid;
3660	field_offset = p - base_ptr;
3661	if (field_offset & `0x1ff`) {
3662	clientid = ophead->oh_clientid;
3663	} else {
3664	idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap);
3665	if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
3666	j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3667	k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3668	clientid = xlog_get_client_id(
3669	xhdr[j].hic_xheader.xh_cycle_data[k]);
3670	} else {
3671	clientid = xlog_get_client_id(
3672	i: iclog->ic_header.h_cycle_data[idx]);
3673	}
3674	}
3675	if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) {
3676	xfs_warn(log->l_mp,
3677	"%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx",
3678	__func__, i, clientid, ophead,
3679	(unsigned long)field_offset);
3680	}
3681
3682	/ check length /
3683	p = &ophead->oh_len;
3684	field_offset = p - base_ptr;
3685	if (field_offset & `0x1ff`) {
3686	op_len = be32_to_cpu(ophead->oh_len);
3687	} else {
3688	idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap);
3689	if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
3690	j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3691	k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3692	op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]);
3693	} else {
3694	op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]);
3695	}
3696	}
3697	ptr += sizeof(xlog_op_header_t) + op_len;
3698	}
3699	}
3700	#endif
3701
3702	/*
3703	* Perform a forced shutdown on the log.
3704	*
3705	* This can be called from low level log code to trigger a shutdown, or from the
3706	* high level mount shutdown code when the mount shuts down.
3707	*
3708	* Our main objectives here are to make sure that:
3709	* a. if the shutdown was not due to a log IO error, flush the logs to
3710	* disk. Anything modified after this is ignored.
3711	* b. the log gets atomically marked 'XLOG_IO_ERROR' for all interested
3712	* parties to find out. Nothing new gets queued after this is done.
3713	* c. Tasks sleeping on log reservations, pinned objects and
3714	* other resources get woken up.
3715	* d. The mount is also marked as shut down so that log triggered shutdowns
3716	* still behave the same as if they called xfs_forced_shutdown().
3717	*
3718	* Return true if the shutdown cause was a log IO error and we actually shut the
3719	* log down.
3720	*/
3721	bool
3722	xlog_force_shutdown(
3723	struct xlog *log,
3724	uint32_t shutdown_flags)
3725	{
3726	bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);
3727
3728	if (!log)
3729	return false;
3730
3731	/*
3732	* Flush all the completed transactions to disk before marking the log
3733	* being shut down. We need to do this first as shutting down the log
3734	* before the force will prevent the log force from flushing the iclogs
3735	* to disk.
3736	*
3737	* When we are in recovery, there are no transactions to flush, and
3738	* we don't want to touch the log because we don't want to perturb the
3739	* current head/tail for future recovery attempts. Hence we need to
3740	* avoid a log force in this case.
3741	*
3742	* If we are shutting down due to a log IO error, then we must avoid
3743	* trying to write the log as that may just result in more IO errors and
3744	* an endless shutdown/force loop.
3745	*/
3746	if (!log_error && !xlog_in_recovery(log))
3747	xfs_log_force(mp: log->l_mp, XFS_LOG_SYNC);
3748
3749	/*
3750	* Atomically set the shutdown state. If the shutdown state is already
3751	* set, there someone else is performing the shutdown and so we are done
3752	* here. This should never happen because we should only ever get called
3753	* once by the first shutdown caller.
3754	*
3755	* Much of the log state machine transitions assume that shutdown state
3756	* cannot change once they hold the log->l_icloglock. Hence we need to
3757	* hold that lock here, even though we use the atomic test_and_set_bit()
3758	* operation to set the shutdown state.
3759	*/
3760	spin_lock(lock: &log->l_icloglock);
3761	if (test_and_set_bit(XLOG_IO_ERROR, addr: &log->l_opstate)) {
3762	spin_unlock(lock: &log->l_icloglock);
3763	return false;
3764	}
3765	spin_unlock(lock: &log->l_icloglock);
3766
3767	/*
3768	* If this log shutdown also sets the mount shutdown state, issue a
3769	* shutdown warning message.
3770	*/
3771	if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, addr: &log->l_mp->m_opstate)) {
3772	xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR,
3773	"Filesystem has been shut down due to log error (0x%x).",
3774	shutdown_flags);
3775	xfs_alert(log->l_mp,
3776	"Please unmount the filesystem and rectify the problem(s).");
3777	if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
3778	xfs_stack_trace();
3779	}
3780
3781	/*
3782	* We don't want anybody waiting for log reservations after this. That
3783	* means we have to wake up everybody queued up on reserveq as well as
3784	* writeq. In addition, we make sure in xlog_{re}grant_log_space that
3785	* we don't enqueue anything once the SHUTDOWN flag is set, and this
3786	* action is protected by the grant locks.
3787	*/
3788	xlog_grant_head_wake_all(head: &log->l_reserve_head);
3789	xlog_grant_head_wake_all(head: &log->l_write_head);
3790
3791	/*
3792	* Wake up everybody waiting on xfs_log_force. Wake the CIL push first
3793	* as if the log writes were completed. The abort handling in the log
3794	* item committed callback functions will do this again under lock to
3795	* avoid races.
3796	*/
3797	spin_lock(lock: &log->l_cilp->xc_push_lock);
3798	wake_up_all(&log->l_cilp->xc_start_wait);
3799	wake_up_all(&log->l_cilp->xc_commit_wait);
3800	spin_unlock(lock: &log->l_cilp->xc_push_lock);
3801
3802	spin_lock(lock: &log->l_icloglock);
3803	xlog_state_shutdown_callbacks(log);
3804	spin_unlock(lock: &log->l_icloglock);
3805
3806	wake_up_var(var: &log->l_opstate);
3807	return log_error;
3808	}
3809
3810	STATIC int
3811	xlog_iclogs_empty(
3812	struct xlog *log)
3813	{
3814	xlog_in_core_t *iclog;
3815
3816	iclog = log->l_iclog;
3817	do {
3818	/ endianness does not matter here, zero is zero in*
3819	* any language.
3820	*/
3821	if (iclog->ic_header.h_num_logops)
3822	return `0`;
3823	iclog = iclog->ic_next;
3824	} while (iclog != log->l_iclog);
3825	return `1`;
3826	}
3827
3828	/*
3829	* Verify that an LSN stamped into a piece of metadata is valid. This is
3830	* intended for use in read verifiers on v5 superblocks.
3831	*/
3832	bool
3833	xfs_log_check_lsn(
3834	struct xfs_mount *mp,
3835	xfs_lsn_t lsn)
3836	{
3837	struct xlog *log = mp->m_log;
3838	bool valid;
3839
3840	/*
3841	* norecovery mode skips mount-time log processing and unconditionally
3842	* resets the in-core LSN. We can't validate in this mode, but
3843	* modifications are not allowed anyways so just return true.
3844	*/
3845	if (xfs_has_norecovery(mp))
3846	return true;
3847
3848	/*
3849	* Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
3850	* handled by recovery and thus safe to ignore here.
3851	*/
3852	if (lsn == NULLCOMMITLSN)
3853	return true;
3854
3855	valid = xlog_valid_lsn(mp->m_log, lsn);
3856
3857	/ warn the user about what's gone wrong before verifier failure /
3858	if (!valid) {
3859	spin_lock(lock: &log->l_icloglock);
3860	xfs_warn(mp,
3861	"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
3862	"Please unmount and run xfs_repair (>= v4.3) to resolve.",
3863	CYCLE_LSN(lsn), BLOCK_LSN(lsn),
3864	log->l_curr_cycle, log->l_curr_block);
3865	spin_unlock(lock: &log->l_icloglock);
3866	}
3867
3868	return valid;
3869	}
3870
3871	/*
3872	* Notify the log that we're about to start using a feature that is protected
3873	* by a log incompat feature flag. This will prevent log covering from
3874	* clearing those flags.
3875	*/
3876	void
3877	xlog_use_incompat_feat(
3878	struct xlog *log)
3879	{
3880	down_read(sem: &log->l_incompat_users);
3881	}
3882
3883	/ Notify the log that we've finished using log incompat features. /
3884	void
3885	xlog_drop_incompat_feat(
3886	struct xlog *log)
3887	{
3888	up_read(sem: &log->l_incompat_users);
3889	}
3890

source code of linux/fs/xfs/xfs_log.c