-----BEGIN PGP SIGNATURE-----
 
 iQEzBAABCAAdFiEEq1nRK9aeMoq1VSgcnJ2qBz9kQNkFAl9JG9wACgkQnJ2qBz9k
 QNlp3ggA3B/Xopb2X3cCpf2fFw63YGJU4i0XJxi+3fC/v6m8U+D4XbqJUjaM5TZz
 +4XABQf7OHvSwDezc3n6KXXD/zbkZCeVm9aohEXvfMYLyKbs+S7QNQALHEtpfBUU
 3IY2pQ90K7JT9cD9pJls/Y/EaA1ObWP7+3F1zpw8OutGchKcE8SvVjzL3SSJaj7k
 d8OTtMosAFuTe4saFWfsf9CmZzbx4sZw3VAzXEXAArrxsmqFKIcY8dI8TQ0WaYNh
 C3wQFvW+n9wHapylyi7RhGl2QH9Tj8POfnCTahNFFJbsmJBx0Z3r42mCBAk4janG
 FW+uDdH5V780bTNNVUKz0v4C/YDiKg==
 =jQnW
 -----END PGP SIGNATURE-----

Merge tag 'writeback_for_v5.9-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs

Pull writeback fixes from Jan Kara:
 "Fixes for writeback code occasionally skipping writeback of some
  inodes or livelocking sync(2)"

* tag 'writeback_for_v5.9-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs:
  writeback: Drop I_DIRTY_TIME_EXPIRE
  writeback: Fix sync livelock due to b_dirty_time processing
  writeback: Avoid skipping inode writeback
  writeback: Protect inode->i_io_list with inode->i_lock
This commit is contained in:
Linus Torvalds 2020-08-28 10:57:14 -07:00
commit e309428590
5 changed files with 67 additions and 63 deletions

View File

@ -4901,7 +4901,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
(inode->i_state & I_DIRTY_TIME)) { (inode->i_state & I_DIRTY_TIME)) {
struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_inode_info *ei = EXT4_I(inode);
inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
spin_lock(&ei->i_raw_lock); spin_lock(&ei->i_raw_lock);

View File

@ -42,7 +42,6 @@
struct wb_writeback_work { struct wb_writeback_work {
long nr_pages; long nr_pages;
struct super_block *sb; struct super_block *sb;
unsigned long *older_than_this;
enum writeback_sync_modes sync_mode; enum writeback_sync_modes sync_mode;
unsigned int tagged_writepages:1; unsigned int tagged_writepages:1;
unsigned int for_kupdate:1; unsigned int for_kupdate:1;
@ -144,7 +143,9 @@ static void inode_io_list_del_locked(struct inode *inode,
struct bdi_writeback *wb) struct bdi_writeback *wb)
{ {
assert_spin_locked(&wb->list_lock); assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
inode->i_state &= ~I_SYNC_QUEUED;
list_del_init(&inode->i_io_list); list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb); wb_io_lists_depopulated(wb);
} }
@ -1122,7 +1123,9 @@ void inode_io_list_del(struct inode *inode)
struct bdi_writeback *wb; struct bdi_writeback *wb;
wb = inode_to_wb_and_lock_list(inode); wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
inode_io_list_del_locked(inode, wb); inode_io_list_del_locked(inode, wb);
spin_unlock(&inode->i_lock);
spin_unlock(&wb->list_lock); spin_unlock(&wb->list_lock);
} }
EXPORT_SYMBOL(inode_io_list_del); EXPORT_SYMBOL(inode_io_list_del);
@ -1172,8 +1175,10 @@ void sb_clear_inode_writeback(struct inode *inode)
* the case then the inode must have been redirtied while it was being written * the case then the inode must have been redirtied while it was being written
* out and we don't reset its dirtied_when. * out and we don't reset its dirtied_when.
*/ */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{ {
assert_spin_locked(&inode->i_lock);
if (!list_empty(&wb->b_dirty)) { if (!list_empty(&wb->b_dirty)) {
struct inode *tail; struct inode *tail;
@ -1182,6 +1187,14 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
inode->dirtied_when = jiffies; inode->dirtied_when = jiffies;
} }
inode_io_list_move_locked(inode, wb, &wb->b_dirty); inode_io_list_move_locked(inode, wb, &wb->b_dirty);
inode->i_state &= ~I_SYNC_QUEUED;
}
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
spin_lock(&inode->i_lock);
redirty_tail_locked(inode, wb);
spin_unlock(&inode->i_lock);
} }
/* /*
@ -1220,16 +1233,13 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
#define EXPIRE_DIRTY_ATIME 0x0001 #define EXPIRE_DIRTY_ATIME 0x0001
/* /*
* Move expired (dirtied before work->older_than_this) dirty inodes from * Move expired (dirtied before dirtied_before) dirty inodes from
* @delaying_queue to @dispatch_queue. * @delaying_queue to @dispatch_queue.
*/ */
static int move_expired_inodes(struct list_head *delaying_queue, static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue, struct list_head *dispatch_queue,
int flags, unsigned long dirtied_before)
struct wb_writeback_work *work)
{ {
unsigned long *older_than_this = NULL;
unsigned long expire_time;
LIST_HEAD(tmp); LIST_HEAD(tmp);
struct list_head *pos, *node; struct list_head *pos, *node;
struct super_block *sb = NULL; struct super_block *sb = NULL;
@ -1237,21 +1247,15 @@ static int move_expired_inodes(struct list_head *delaying_queue,
int do_sb_sort = 0; int do_sb_sort = 0;
int moved = 0; int moved = 0;
if ((flags & EXPIRE_DIRTY_ATIME) == 0)
older_than_this = work->older_than_this;
else if (!work->for_sync) {
expire_time = jiffies - (dirtytime_expire_interval * HZ);
older_than_this = &expire_time;
}
while (!list_empty(delaying_queue)) { while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev); inode = wb_inode(delaying_queue->prev);
if (older_than_this && if (inode_dirtied_after(inode, dirtied_before))
inode_dirtied_after(inode, *older_than_this))
break; break;
list_move(&inode->i_io_list, &tmp); list_move(&inode->i_io_list, &tmp);
moved++; moved++;
if (flags & EXPIRE_DIRTY_ATIME) spin_lock(&inode->i_lock);
set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); inode->i_state |= I_SYNC_QUEUED;
spin_unlock(&inode->i_lock);
if (sb_is_blkdev_sb(inode->i_sb)) if (sb_is_blkdev_sb(inode->i_sb))
continue; continue;
if (sb && sb != inode->i_sb) if (sb && sb != inode->i_sb)
@ -1289,18 +1293,22 @@ out:
* | * |
* +--> dequeue for IO * +--> dequeue for IO
*/ */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
unsigned long dirtied_before)
{ {
int moved; int moved;
unsigned long time_expire_jif = dirtied_before;
assert_spin_locked(&wb->list_lock); assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io); list_splice_init(&wb->b_more_io, &wb->b_io);
moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
if (!work->for_sync)
time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
EXPIRE_DIRTY_ATIME, work); time_expire_jif);
if (moved) if (moved)
wb_io_lists_populated(wb); wb_io_lists_populated(wb);
trace_writeback_queue_io(wb, work, moved); trace_writeback_queue_io(wb, work, dirtied_before, moved);
} }
static int write_inode(struct inode *inode, struct writeback_control *wbc) static int write_inode(struct inode *inode, struct writeback_control *wbc)
@ -1394,7 +1402,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* writeback is not making progress due to locked * writeback is not making progress due to locked
* buffers. Skip this inode for now. * buffers. Skip this inode for now.
*/ */
redirty_tail(inode, wb); redirty_tail_locked(inode, wb);
return; return;
} }
@ -1414,7 +1422,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* retrying writeback of the dirty page/inode * retrying writeback of the dirty page/inode
* that cannot be performed immediately. * that cannot be performed immediately.
*/ */
redirty_tail(inode, wb); redirty_tail_locked(inode, wb);
} }
} else if (inode->i_state & I_DIRTY) { } else if (inode->i_state & I_DIRTY) {
/* /*
@ -1422,10 +1430,11 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* such as delayed allocation during submission or metadata * such as delayed allocation during submission or metadata
* updates after data IO completion. * updates after data IO completion.
*/ */
redirty_tail(inode, wb); redirty_tail_locked(inode, wb);
} else if (inode->i_state & I_DIRTY_TIME) { } else if (inode->i_state & I_DIRTY_TIME) {
inode->dirtied_when = jiffies; inode->dirtied_when = jiffies;
inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
inode->i_state &= ~I_SYNC_QUEUED;
} else { } else {
/* The inode is clean. Remove from writeback lists. */ /* The inode is clean. Remove from writeback lists. */
inode_io_list_del_locked(inode, wb); inode_io_list_del_locked(inode, wb);
@ -1472,18 +1481,14 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_lock(&inode->i_lock); spin_lock(&inode->i_lock);
dirty = inode->i_state & I_DIRTY; dirty = inode->i_state & I_DIRTY;
if (inode->i_state & I_DIRTY_TIME) { if ((inode->i_state & I_DIRTY_TIME) &&
if ((dirty & I_DIRTY_INODE) || ((dirty & I_DIRTY_INODE) ||
wbc->sync_mode == WB_SYNC_ALL || wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync ||
unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || time_after(jiffies, inode->dirtied_time_when +
unlikely(time_after(jiffies, dirtytime_expire_interval * HZ))) {
(inode->dirtied_time_when + dirty |= I_DIRTY_TIME;
dirtytime_expire_interval * HZ)))) { trace_writeback_lazytime(inode);
dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; }
trace_writeback_lazytime(inode);
}
} else
inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
inode->i_state &= ~dirty; inode->i_state &= ~dirty;
/* /*
@ -1669,8 +1674,8 @@ static long writeback_sb_inodes(struct super_block *sb,
*/ */
spin_lock(&inode->i_lock); spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
redirty_tail_locked(inode, wb);
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
redirty_tail(inode, wb);
continue; continue;
} }
if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
@ -1811,7 +1816,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
blk_start_plug(&plug); blk_start_plug(&plug);
spin_lock(&wb->list_lock); spin_lock(&wb->list_lock);
if (list_empty(&wb->b_io)) if (list_empty(&wb->b_io))
queue_io(wb, &work); queue_io(wb, &work, jiffies);
__writeback_inodes_wb(wb, &work); __writeback_inodes_wb(wb, &work);
spin_unlock(&wb->list_lock); spin_unlock(&wb->list_lock);
blk_finish_plug(&plug); blk_finish_plug(&plug);
@ -1831,7 +1836,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
* takes longer than a dirty_writeback_interval interval, then leave a * takes longer than a dirty_writeback_interval interval, then leave a
* one-second gap. * one-second gap.
* *
* older_than_this takes precedence over nr_to_write. So we'll only write back * dirtied_before takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings. * all dirty pages if they are all attached to "old" mappings.
*/ */
static long wb_writeback(struct bdi_writeback *wb, static long wb_writeback(struct bdi_writeback *wb,
@ -1839,14 +1844,11 @@ static long wb_writeback(struct bdi_writeback *wb,
{ {
unsigned long wb_start = jiffies; unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages; long nr_pages = work->nr_pages;
unsigned long oldest_jif; unsigned long dirtied_before = jiffies;
struct inode *inode; struct inode *inode;
long progress; long progress;
struct blk_plug plug; struct blk_plug plug;
oldest_jif = jiffies;
work->older_than_this = &oldest_jif;
blk_start_plug(&plug); blk_start_plug(&plug);
spin_lock(&wb->list_lock); spin_lock(&wb->list_lock);
for (;;) { for (;;) {
@ -1880,14 +1882,14 @@ static long wb_writeback(struct bdi_writeback *wb,
* safe. * safe.
*/ */
if (work->for_kupdate) { if (work->for_kupdate) {
oldest_jif = jiffies - dirtied_before = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10); msecs_to_jiffies(dirty_expire_interval * 10);
} else if (work->for_background) } else if (work->for_background)
oldest_jif = jiffies; dirtied_before = jiffies;
trace_writeback_start(wb, work); trace_writeback_start(wb, work);
if (list_empty(&wb->b_io)) if (list_empty(&wb->b_io))
queue_io(wb, work); queue_io(wb, work, dirtied_before);
if (work->sb) if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work); progress = writeback_sb_inodes(work->sb, wb, work);
else else
@ -2289,11 +2291,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
inode->i_state |= flags; inode->i_state |= flags;
/* /*
* If the inode is being synced, just update its dirty state. * If the inode is queued for writeback by flush worker, just
* The unlocker will place the inode on the appropriate * update its dirty state. Once the flush worker is done with
* superblock list, based upon its state. * the inode it will place it on the appropriate superblock
* list, based upon its state.
*/ */
if (inode->i_state & I_SYNC) if (inode->i_state & I_SYNC_QUEUED)
goto out_unlock_inode; goto out_unlock_inode;
/* /*

View File

@ -110,9 +110,9 @@ xfs_trans_log_inode(
* to log the timestamps, or will clear already cleared fields in the * to log the timestamps, or will clear already cleared fields in the
* worst case. * worst case.
*/ */
if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) { if (inode->i_state & I_DIRTY_TIME) {
spin_lock(&inode->i_lock); spin_lock(&inode->i_lock);
inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
} }

View File

@ -2132,6 +2132,10 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
* *
* I_DONTCACHE Evict inode as soon as it is not used anymore. * I_DONTCACHE Evict inode as soon as it is not used anymore.
* *
* I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists.
* Used to detect that mark_inode_dirty() should not move
* inode between dirty lists.
*
* Q: What is the difference between I_WILL_FREE and I_FREEING? * Q: What is the difference between I_WILL_FREE and I_FREEING?
*/ */
#define I_DIRTY_SYNC (1 << 0) #define I_DIRTY_SYNC (1 << 0)
@ -2149,12 +2153,11 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
#define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP) #define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP)
#define I_LINKABLE (1 << 10) #define I_LINKABLE (1 << 10)
#define I_DIRTY_TIME (1 << 11) #define I_DIRTY_TIME (1 << 11)
#define __I_DIRTY_TIME_EXPIRED 12
#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED)
#define I_WB_SWITCH (1 << 13) #define I_WB_SWITCH (1 << 13)
#define I_OVL_INUSE (1 << 14) #define I_OVL_INUSE (1 << 14)
#define I_CREATING (1 << 15) #define I_CREATING (1 << 15)
#define I_DONTCACHE (1 << 16) #define I_DONTCACHE (1 << 16)
#define I_SYNC_QUEUED (1 << 17)
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)

View File

@ -20,7 +20,6 @@
{I_CLEAR, "I_CLEAR"}, \ {I_CLEAR, "I_CLEAR"}, \
{I_SYNC, "I_SYNC"}, \ {I_SYNC, "I_SYNC"}, \
{I_DIRTY_TIME, "I_DIRTY_TIME"}, \ {I_DIRTY_TIME, "I_DIRTY_TIME"}, \
{I_DIRTY_TIME_EXPIRED, "I_DIRTY_TIME_EXPIRED"}, \
{I_REFERENCED, "I_REFERENCED"} \ {I_REFERENCED, "I_REFERENCED"} \
) )
@ -498,8 +497,9 @@ DEFINE_WBC_EVENT(wbc_writepage);
TRACE_EVENT(writeback_queue_io, TRACE_EVENT(writeback_queue_io,
TP_PROTO(struct bdi_writeback *wb, TP_PROTO(struct bdi_writeback *wb,
struct wb_writeback_work *work, struct wb_writeback_work *work,
unsigned long dirtied_before,
int moved), int moved),
TP_ARGS(wb, work, moved), TP_ARGS(wb, work, dirtied_before, moved),
TP_STRUCT__entry( TP_STRUCT__entry(
__array(char, name, 32) __array(char, name, 32)
__field(unsigned long, older) __field(unsigned long, older)
@ -509,19 +509,17 @@ TRACE_EVENT(writeback_queue_io,
__field(ino_t, cgroup_ino) __field(ino_t, cgroup_ino)
), ),
TP_fast_assign( TP_fast_assign(
unsigned long *older_than_this = work->older_than_this;
strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
__entry->older = older_than_this ? *older_than_this : 0; __entry->older = dirtied_before;
__entry->age = older_than_this ? __entry->age = (jiffies - dirtied_before) * 1000 / HZ;
(jiffies - *older_than_this) * 1000 / HZ : -1;
__entry->moved = moved; __entry->moved = moved;
__entry->reason = work->reason; __entry->reason = work->reason;
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
), ),
TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu",
__entry->name, __entry->name,
__entry->older, /* older_than_this in jiffies */ __entry->older, /* dirtied_before in jiffies */
__entry->age, /* older_than_this in relative milliseconds */ __entry->age, /* dirtied_before in relative milliseconds */
__entry->moved, __entry->moved,
__print_symbolic(__entry->reason, WB_WORK_REASON), __print_symbolic(__entry->reason, WB_WORK_REASON),
(unsigned long)__entry->cgroup_ino (unsigned long)__entry->cgroup_ino