diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bd588a4cdddc..817c9bfd0cd0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -513,7 +513,7 @@ __xlog_state_release_iclog( * Flush iclog to disk if this is the last reference to the given iclog and the * it is in the WANT_SYNC state. */ -static int +int xlog_state_release_iclog( struct xlog *log, struct xlog_in_core *iclog) @@ -533,23 +533,6 @@ xlog_state_release_iclog( return 0; } -void -xfs_log_release_iclog( - struct xlog_in_core *iclog) -{ - struct xlog *log = iclog->ic_log; - bool sync = false; - - if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) { - if (iclog->ic_state != XLOG_STATE_IOERROR) - sync = __xlog_state_release_iclog(log, iclog); - spin_unlock(&log->l_icloglock); - } - - if (sync) - xlog_sync(log, iclog); -} - /* * Mount a log filesystem * @@ -837,6 +820,14 @@ xlog_write_unmount_record( /* account for space used by record data */ ticket->t_curr_res -= sizeof(ulf); + + /* + * For external log devices, we need to flush the data device cache + * first to ensure all metadata writeback is on stable storage before we + * stamp the tail LSN into the unmount record. + */ + if (log->l_targ != log->l_mp->m_ddev_targp) + blkdev_issue_flush(log->l_targ->bt_bdev); return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS); } @@ -874,6 +865,11 @@ out_err: else ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || iclog->ic_state == XLOG_STATE_IOERROR); + /* + * Ensure the journal is fully flushed and on stable storage once the + * iclog containing the unmount record is written. + */ + iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); error = xlog_state_release_iclog(log, iclog); xlog_wait_on_iclog(iclog); @@ -1755,8 +1751,7 @@ xlog_write_iclog( struct xlog *log, struct xlog_in_core *iclog, uint64_t bno, - unsigned int count, - bool need_flush) + unsigned int count) { ASSERT(bno < log->l_logBBsize); @@ -1794,10 +1789,12 @@ xlog_write_iclog( * writeback throttle from throttling log writes behind background * metadata writeback and causing priority inversions. */ - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | - REQ_IDLE | REQ_FUA; - if (need_flush) + iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE; + if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) iclog->ic_bio.bi_opf |= REQ_PREFLUSH; + if (iclog->ic_flags & XLOG_ICL_NEED_FUA) + iclog->ic_bio.bi_opf |= REQ_FUA; + iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); @@ -1900,7 +1897,6 @@ xlog_sync( unsigned int roundoff; /* roundoff to BB or stripe */ uint64_t bno; unsigned int size; - bool need_flush = true, split = false; ASSERT(atomic_read(&iclog->ic_refcnt) == 0); @@ -1925,10 +1921,8 @@ xlog_sync( bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)); /* Do we need to split this write into 2 parts? */ - if (bno + BTOBB(count) > log->l_logBBsize) { + if (bno + BTOBB(count) > log->l_logBBsize) xlog_split_iclog(log, &iclog->ic_header, bno, count); - split = true; - } /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, @@ -1949,22 +1943,8 @@ xlog_sync( be64_to_cpu(iclog->ic_header.h_lsn)); } #endif - - /* - * Flush the data device before flushing the log to make sure all meta - * data written back from the AIL actually made it to disk before - * stamping the new log tail LSN into the log buffer. For an external - * log we need to issue the flush explicitly, and unfortunately - * synchronously here; for an internal log we can simply use the block - * layer state machine for preflushes. - */ - if (log->l_targ != log->l_mp->m_ddev_targp || split) { - blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev); - need_flush = false; - } - xlog_verify_iclog(log, iclog, count); - xlog_write_iclog(log, iclog, bno, count, need_flush); + xlog_write_iclog(log, iclog, bno, count); } /* @@ -2418,7 +2398,7 @@ xlog_write( ASSERT(log_offset <= iclog->ic_size - 1); ptr = iclog->ic_datap + log_offset; - /* start_lsn is the first lsn written to. That's all we need. */ + /* Start_lsn is the first lsn written to. */ if (start_lsn && !*start_lsn) *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 044e02cb8921..99f9d6ed9598 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -117,7 +117,6 @@ void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); -void xfs_log_release_iclog(struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, int length, int count, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 172bb3551d6b..9d2fa8464289 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -890,15 +890,25 @@ restart: /* * If the checkpoint spans multiple iclogs, wait for all previous - * iclogs to complete before we submit the commit_iclog. + * iclogs to complete before we submit the commit_iclog. In this case, + * the commit_iclog write needs to issue a pre-flush so that the + * ordering is correctly preserved down to stable storage. */ + spin_lock(&log->l_icloglock); if (ctx->start_lsn != commit_lsn) { - spin_lock(&log->l_icloglock); xlog_wait_on_iclog(commit_iclog->ic_prev); + spin_lock(&log->l_icloglock); + commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; } - /* release the hounds! */ - xfs_log_release_iclog(commit_iclog); + /* + * The commit iclog must be written to stable storage to guarantee + * journal IO vs metadata writeback IO is correctly ordered on stable + * storage. + */ + commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA; + xlog_state_release_iclog(log, commit_iclog); + spin_unlock(&log->l_icloglock); return; out_skip: diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 56e1942c47df..2203ccecafb6 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -133,6 +133,9 @@ enum xlog_iclog_state { #define XLOG_COVER_OPS 5 +#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */ +#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */ + /* Ticket reservation region accounting */ #define XLOG_TIC_LEN_MAX 15 @@ -201,6 +204,7 @@ typedef struct xlog_in_core { u32 ic_size; u32 ic_offset; enum xlog_iclog_state ic_state; + unsigned int ic_flags; char *ic_datap; /* pointer to iclog data */ /* Callback structures need their own cacheline */ @@ -486,6 +490,8 @@ int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); +int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); + /* * When we crack an atomic LSN, we sample it first so that the value will not * change while we are cracking it into the component values. This means we